Merge pull request #66 from rhaist/rha/add-index

Add report index CSV and Python generator code
CyberMonitor 2021-08-04 10:18:13 +08:00 committed by GitHub
commit d812bd0fd3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 1515 additions and 0 deletions

1404
index.csv Normal file

File diff suppressed because it is too large
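Although the index.csv diff is suppressed, the header row written by generate_index.py below implies each record takes roughly this shape (the data row here is a hypothetical example, not a real entry from the commit):

Published,SHA-1,Filename,Download URL
2021-08-04,da39a3ee5e6b4b0d3255bfef95601890afd80709,Example_Report,https://github.com/CyberMonitor/APT_CyberCriminal_Campagin_Collections/raw/master/2021.08.04_Example_Campaign/Example_Report.pdf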

110
index/generate_index.py Normal file

@@ -0,0 +1,110 @@
import os
import re
import csv
import argparse
import logging
import hashlib
import ntpath
from datetime import datetime
from urllib.parse import quote

import PyPDF2
from PyPDF2.utils import PdfReadError

# Report directories are expected to start with a YYYY.MM.DD date.
DATE_REGEX = re.compile(r"^\d{4}\.\d{2}\.\d{2}")
DOWN_BASE_PATH = (
    "https://github.com/CyberMonitor/APT_CyberCriminal_Campagin_Collections/raw/master/"
)

processed_reports_list = []


def get_file_sha1_hash(path: str) -> str:
    """Return the SHA-1 hex digest of the file at the given path."""
    hash_sha1 = hashlib.sha1()
    with open(path, "rb") as f:
        hash_sha1.update(f.read())
    return hash_sha1.hexdigest()


def index_report(path: str):
    checksum = get_file_sha1_hash(path)
    down_path = DOWN_BASE_PATH + quote(path)
    # Get the published date from the directory name if possible
    published_raw = DATE_REGEX.match(os.path.basename(os.path.dirname(path)))
    if published_raw is None or ".00" in published_raw.group(0):
        # No usable date in the path; fall back to the PDF creation date.
        logging.debug(f"no published date for report: {path}")
        with open(path, "rb") as f:
            pypdf = PyPDF2.PdfFileReader(f, strict=False)
            if pypdf.isEncrypted:
                pypdf.decrypt("")
            try:
                # documentInfo may be None, hence TypeError in the tuple below.
                cdate_raw = pypdf.documentInfo["/CreationDate"]
                # e.g. "D:20210804101813+08'00'" -> "20210804"
                cdate_clean = cdate_raw.replace("'", "").replace("D:", "")[:8]
                published = datetime.strptime(cdate_clean, "%Y%m%d").date()
            except (KeyError, TypeError, ValueError, PdfReadError) as derr:
                logging.error(f"no date for report: {path} | {derr}")
                return
    else:
        logging.debug(published_raw)
        published = datetime.strptime(published_raw.group(0), "%Y.%m.%d").date()
    # ntpath.basename also strips Windows-style backslash separators.
    title = ntpath.basename(path).replace(".pdf", "").replace(".PDF", "")
    processed_reports_list.append((published, checksum, title, down_path))


def process_reports(path: str):
    # Recurse the given path to find valid PDF reports
    report_list = []
    for dirpath, _subdirs, files in os.walk(path):
        for filename in files:
            full_path = os.path.join(dirpath, filename)
            if not filename.endswith(".pdf"):
                continue
            try:
                # Parse once up front to weed out broken or non-PDF files.
                with open(full_path, "rb") as f:
                    PyPDF2.PdfFileReader(f)
                rel_dir = os.path.relpath(dirpath, os.getcwd())
                report_list.append(os.path.join(rel_dir, filename))
            except Exception as perr:
                logging.debug(f"invalid or not a PDF file: {full_path} {perr}")
                continue
    for rep in report_list:
        logging.debug(f"processing {rep}")
        index_report(rep)
    # Write the collected rows, oldest report first.
    with open("index.csv", "w", newline="") as csvfile:
        sorted_reports = sorted(processed_reports_list, key=lambda x: x[0])
        fieldnames = ["Published", "SHA-1", "Filename", "Download URL"]
        indexwriter = csv.writer(csvfile, dialect="excel")
        indexwriter.writerow(fieldnames)
        indexwriter.writerows(sorted_reports)


def main():
    arg_parser = argparse.ArgumentParser(description="Index documents in repository")
    arg_parser.add_argument(
        "-p", "--path", help="Path to the document repository", default=os.getcwd()
    )
    arg_parser.add_argument(
        "-d", "--debug", action="store_true", help="print debug messages"
    )
    args = arg_parser.parse_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    process_reports(args.path)


if __name__ == "__main__":
    main()
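A minimal invocation sketch for the script above, assuming it is run from the repository root (index.csv is written to, and relative download paths are computed from, the current working directory):

python index/generate_index.py --path . --debug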

1
index/requirements.txt Normal file

@@ -0,0 +1 @@
PyPDF2
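The requirement is left unpinned, but the script relies on the PyPDF2 1.x API (PdfFileReader, isEncrypted, PyPDF2.utils.PdfReadError), which later major releases renamed or removed; if a newer PyPDF2 is installed, a bound along these lines may be needed (the exact pin is an assumption, not part of this commit):

PyPDF2<2.0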