mirror of
https://github.com/CyberMonitor/APT_CyberCriminal_Campagin_Collections
synced 2024-06-28 09:51:38 +00:00
Merge pull request #66 from rhaist/rha/add-index
Add report index csv and python generator code
This commit is contained in:
commit
d812bd0fd3
Binary file not shown.
Binary file not shown.
110
index/generate_index.py
Normal file
110
index/generate_index.py
Normal file
@ -0,0 +1,110 @@
|
||||
import os
|
||||
import re
|
||||
import csv
|
||||
import argparse
|
||||
import logging
|
||||
import hashlib
|
||||
import PyPDF2
|
||||
import ntpath
|
||||
|
||||
from PyPDF2.utils import PdfReadError
|
||||
from urllib.parse import quote
|
||||
from datetime import datetime
|
||||
|
||||
# Matches a leading "YYYY.MM.DD" prefix; index_report() applies it to the name
# of the directory that contains a report.
# Raw string: "\d" and "\." in a plain literal are invalid escape sequences.
DATE_REGEX = re.compile(r"^\d{4}\.\d{2}\.\d{2}")

# Prefix that index_report() prepends to the URL-quoted report path.
DOWN_BASE_PATH = (
    "https://github.com/CyberMonitor/APT_CyberCriminal_Campagin_Collections/raw/master/"
)

# Filled by index_report() with (published, checksum, title, down_path) tuples.
processed_reports_list = []
|
||||
|
||||
|
||||
def get_file_sha1_hash(path: str) -> str:
    """Return the SHA-1 hex digest of the file at *path*.

    The file is read in fixed-size chunks so a large report is never held in
    memory as a whole.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    hash_sha1 = hashlib.sha1()
    with open(path, "rb") as f:
        # iter() with a sentinel stops on the empty bytes object read at EOF.
        for chunk in iter(lambda: f.read(65536), b""):
            hash_sha1.update(chunk)
    return hash_sha1.hexdigest()
|
||||
|
||||
|
||||
def _published_from_dirname(path: str):
    """Return the date encoded in the report's parent directory name, or None."""
    published_raw = DATE_REGEX.match(os.path.basename(os.path.dirname(path)))
    # NOTE(review): ".00" presumably marks an unknown month/day - confirm.
    if published_raw is None or ".00" in published_raw.group(0):
        return None
    logging.debug(published_raw)
    try:
        return datetime.strptime(published_raw.group(0), "%Y.%m.%d").date()
    except ValueError:
        # Digits matched the pattern but do not form a real date (e.g. month 13).
        return None


def _published_from_pdf(path: str):
    """Return the date taken from the PDF's "/CreationDate" metadata entry.

    Lets KeyError, TypeError, ValueError, NotImplementedError and PdfReadError
    propagate when the file holds no usable creation date.
    """
    # The reader is only used while the file is open; the handle is always closed.
    with open(path, "rb") as pdf_file:
        pypdf = PyPDF2.PdfFileReader(pdf_file, strict=False)
        if pypdf.isEncrypted:
            pypdf.decrypt("")
        # NOTE(review): documentInfo is presumably None for PDFs without an
        # info dictionary; the subscript then raises TypeError - confirm.
        cdate_raw = pypdf.documentInfo["/CreationDate"]
    cdate_clean = cdate_raw.replace("'", "").replace("D:", "")[:8]
    return datetime.strptime(cdate_clean, "%Y%m%d").date()


def index_report(path: str):
    """Append one report's index row to ``processed_reports_list``.

    The row is ``(published, checksum, title, down_path)``. The published date
    comes from the parent directory name when that name starts with a usable
    "YYYY.MM.DD" prefix, otherwise from the PDF's creation date. When neither
    yields a date the report is logged and skipped.

    Args:
        path: Path to the PDF report; also used to build the download URL.

    Returns:
        None. The result is appended to ``processed_reports_list``.

    Raises:
        OSError: if the file cannot be opened for hashing or parsing.
    """
    checksum = get_file_sha1_hash(path)
    # URLs use forward slashes, so normalise the OS separator before quoting.
    down_path = DOWN_BASE_PATH + quote(path.replace(os.sep, "/"))

    # Get the published date from the path name if possible
    published = _published_from_dirname(path)
    if published is None:
        logging.debug(f"no published date for report: {path}")
        try:
            published = _published_from_pdf(path)
        except (
            KeyError,
            TypeError,
            ValueError,
            NotImplementedError,
            PdfReadError,
        ) as derr:
            logging.error(f"no date for report: {path} | {derr}")
            return

    # Strip only a trailing ".pdf" extension (any letter case), not every
    # occurrence of the text inside the file name.
    title = ntpath.basename(path)
    if title.lower().endswith(".pdf"):
        title = title[: -len(".pdf")]

    processed_reports_list.append((published, checksum, title, down_path))
|
||||
|
||||
|
||||
def process_reports(path: str):
    """Index every PDF report below *path* and write the result to ``index.csv``.

    Walks *path* recursively, keeps the files that PyPDF2 can open, passes
    each one to ``index_report()`` as a path relative to the current working
    directory, and finally writes all rows collected in
    ``processed_reports_list`` to ``index.csv`` in the current working
    directory, ordered by published date and then by filename.

    Args:
        path: Root directory to search for reports.
    """
    # Recurse the given path to find PDF reports
    report_list = []

    # Loop names differ from the "path" argument so it is not shadowed.
    for dirpath, _subdirs, files in os.walk(path):
        for filename in files:
            # Case-insensitive so ".PDF" reports are indexed as well.
            if not filename.lower().endswith(".pdf"):
                continue
            full_path = os.path.join(dirpath, filename)
            try:
                # Constructing the reader is the validity check; the handle is
                # closed as soon as the check is done.
                with open(full_path, "rb") as pdf_file:
                    PyPDF2.PdfFileReader(pdf_file)
                rel_dir = os.path.relpath(dirpath, os.getcwd())
                rel_file = os.path.join(rel_dir, filename)
                report_list.append(rel_file)
            except Exception as perr:
                # Deliberately broad: any failure just excludes this file.
                logging.debug(f"invalid or not a PDF file: {full_path} {perr}")
                continue
            logging.debug(f"processing {full_path}")

    for rep in report_list:
        logging.debug(f"processing {rep}")
        index_report(rep)

    # Tie-break equal dates by filename so the output does not depend on the
    # order in which os.walk happened to list the files.
    sorted_reports = sorted(processed_reports_list, key=lambda row: (row[0], row[2]))
    fieldnames = ["Published", "SHA-1", "Filename", "Download URL"]

    # Explicit encoding so non-ASCII titles do not depend on the platform default.
    with open("index.csv", "w", newline="", encoding="utf-8") as csvfile:
        indexwriter = csv.writer(csvfile, dialect="excel")
        indexwriter.writerow(fieldnames)
        indexwriter.writerows(sorted_reports)
|
||||
|
||||
|
||||
# ARGPARSE
# Only the parser is built at import time; parsing happens inside main() so
# that importing this module never reads sys.argv.
arg_parser = argparse.ArgumentParser(description="Index documents in Repository")
arg_parser.add_argument(
    "-p", "--path", help="Path to the document repository", default=os.getcwd()
)
arg_parser.add_argument(
    "-d", "--debug", action="store_true", help="print debug messages"
)


def main(argv=None):
    """Parse the command line and index the selected repository.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    args = arg_parser.parse_args(argv)
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    process_reports(args.path)


if __name__ == "__main__":
    main()
|
1
index/requirements.txt
Normal file
1
index/requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
PyPDF2
|
Loading…
Reference in New Issue
Block a user