APT_CyberCriminal_Campagin_.../index/generate_index.py
2022-07-13 11:37:19 +02:00

111 lines
3.4 KiB
Python
Executable File

import os
import re
import csv
import argparse
import logging
import hashlib
import PyPDF2
import ntpath
from PyPDF2.errors import PdfReadError
from urllib.parse import quote
from datetime import datetime
# Matches a leading "YYYY.MM.DD" stamp at the start of a string. index_report()
# applies it to the name of the directory that contains a report.
# Raw string: "\d" and "\." are regex escapes, not Python string escapes.
DATE_REGEX = re.compile(r"^\d{4}\.\d{2}\.\d{2}")

# Prefix that index_report() prepends to a report's URL-quoted path to build
# its download URL.
DOWN_BASE_PATH = (
    "https://github.com/CyberMonitor/APT_CyberCriminal_Campagin_Collections/raw/master/"
)

# One (published, checksum, title, down_path) tuple per indexed report.
# Appended to by index_report() and written to index.csv by process_reports().
processed_reports_list = []
def get_file_sha1_hash(path: str) -> str:
    """Return the hexadecimal SHA-1 digest of the file at *path*.

    The file is read in fixed-size chunks and fed to the hash incrementally,
    so a large file is never loaded into memory in one piece.

    Args:
        path: Path of the file to hash; it is opened in binary mode.

    Returns:
        The SHA-1 digest as a lowercase hexadecimal string.
    """
    hash_sha1 = hashlib.sha1()
    with open(path, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: f.read(65536), b""):
            hash_sha1.update(chunk)
    return hash_sha1.hexdigest()
def index_report(path: str):
    """Index one PDF report and append its record to ``processed_reports_list``.

    The published date is taken from a leading ``YYYY.MM.DD`` stamp on the
    name of the directory containing the report. When that stamp is missing,
    or contains ``.00``, the date is read from the PDF's ``/CreationDate``
    metadata instead. If neither source yields a date, the problem is logged
    and the report is skipped (nothing is appended).

    The PDF is only opened and parsed when the metadata fallback is needed,
    and the file handle is always closed afterwards.

    Args:
        path: Path to the PDF file. It is also URL-quoted and appended to
            ``DOWN_BASE_PATH`` to form the download URL.
    """
    checksum = get_file_sha1_hash(path)
    down_path = DOWN_BASE_PATH + quote(path)

    # Get the published date from the path name if possible
    published_raw = DATE_REGEX.match(os.path.basename(os.path.dirname(path)))
    if published_raw is None or ".00" in published_raw.group(0):
        logging.debug(f"no published date for report: {path}")
        try:
            with open(path, "rb") as pdf_file:
                pypdf = PyPDF2.PdfFileReader(pdf_file, strict=False)
                if pypdf.isEncrypted:
                    # Try the empty password so the metadata can be read.
                    pypdf.decrypt("")
                cdate_raw = pypdf.documentInfo["/CreationDate"]
                # Drop the "D:" prefix and any apostrophes, then keep the
                # first eight characters, which are parsed as YYYYMMDD.
                cdate_clean = cdate_raw.replace("'", "").replace("D:", "")[:8]
            published = datetime.strptime(cdate_clean, "%Y%m%d").date()
        except (KeyError, ValueError, TypeError, PdfReadError) as derr:
            # TypeError covers documentInfo being None (PDF without an /Info
            # dictionary) and a /CreationDate value that is not a text string.
            logging.error(f"no date for report: {path} | {derr}")
            return
    else:
        logging.debug(published_raw)
        published = datetime.strptime(published_raw.group(0), "%Y.%m.%d").date()

    title = ntpath.basename(path).replace(".pdf", "").replace(".PDF", "")
    processed_reports_list.append((published, checksum, title, down_path))
def process_reports(path: str):
    """Find PDF reports under *path*, index them, and write ``index.csv``.

    Every file whose name ends with ``.pdf`` is test-opened with PyPDF2;
    files that fail are logged at debug level and skipped. Each accepted
    report is passed to ``index_report`` using its path relative to the
    current working directory. Finally all records in
    ``processed_reports_list`` are sorted by published date and written to
    ``index.csv`` in the current working directory.

    Args:
        path: Root directory to search recursively.
    """
    # Recurse the given path to find PDF reports
    report_list = []
    # Distinct loop names keep the ``path`` argument from being overwritten.
    for dirpath, _subdirs, files in os.walk(path):
        for filename in files:
            full_path = os.path.join(dirpath, filename)
            if not filename.endswith(".pdf"):
                continue
            try:
                # Constructing the reader is only a validity check; the
                # handle is closed as soon as the check is done.
                with open(full_path, "rb") as pdf_file:
                    PyPDF2.PdfFileReader(pdf_file)
                rel_dir = os.path.relpath(dirpath, os.getcwd())
                rel_file = os.path.join(rel_dir, filename)
                report_list.append(rel_file)
            except Exception as perr:
                # Best effort: one unreadable file must not stop the scan.
                logging.debug(f"invalid or not a PDF file: {full_path} {perr}")
                continue
            logging.debug(f"processing {full_path}")

    for rep in report_list:
        logging.debug(f"processing {rep}")
        index_report(rep)

    with open("index.csv", "w", newline="") as csvfile:
        # Sort by the first tuple element, the published date.
        sorted_reports = sorted(processed_reports_list, key=lambda x: x[0])
        fieldnames = ["Published", "SHA-1", "Filename", "Download URL"]
        indexwriter = csv.writer(csvfile, dialect="excel")
        indexwriter.writerow(fieldnames)
        indexwriter.writerows(sorted_reports)
# ARGPARSE
# The parser is built at import time, but arguments are only parsed inside
# main(), so importing this module neither reads sys.argv nor exits.
arg_parser = argparse.ArgumentParser(description="Index documents in Repository")
arg_parser.add_argument(
    "-p", "--path", help="Path to the document repository", default=os.getcwd()
)
arg_parser.add_argument(
    "-d", "--debug", action="store_true", help="print debug messages"
)


def main(argv=None):
    """Parse command-line options and build the report index.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the arguments from ``sys.argv``.
    """
    args = arg_parser.parse_args(argv)
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    process_reports(args.path)


if __name__ == "__main__":
    main()