Fixes for PyPDF2

This commit is contained in:
Robert Haist 2022-12-27 17:57:51 +01:00
parent 72b0d84034
commit e4cb9cc9aa
2 changed files with 1541 additions and 7 deletions

1533
index.csv

File diff suppressed because it is too large Load Diff

View File

@@ -33,18 +33,19 @@ def index_report(path: str):
# Get the published date from the path name if possible
published_raw = DATE_REGEX.match(os.path.basename(os.path.dirname(path)))
pypdf = PyPDF2.PdfFileReader(open(path, "rb"), strict=False)
pypdf = PyPDF2.PdfReader(open(path, "rb"), strict=False)
if published_raw == None or (".00" in published_raw.group(0)):
logging.debug(f"no published date for report: {path}")
if pypdf.isEncrypted:
if pypdf.is_encrypted:
pypdf.decrypt("")
try:
cdate_raw = pypdf.documentInfo["/CreationDate"]
cdate_clean = cdate_raw.replace("'", "").replace("D:", "")[:8]
cadate_parsed = datetime.strptime(cdate_clean, "%Y%m%d")
published = cadate_parsed.date()
cdate = pypdf.metadata.creation_date
if cdate != None:
published = pypdf.metadata.creation_date.date()
else:
published = datetime.min.date()
except (KeyError, ValueError, PdfReadError) as derr:
logging.error(f"no date for report: {path} | {derr}")
return
@@ -67,7 +68,7 @@ def process_reports(path: str):
if not filepath.endswith(".pdf"):
continue
try:
PyPDF2.PdfFileReader(open(full_path, "rb"))
PyPDF2.PdfReader(open(full_path, "rb"))
rel_dir = os.path.relpath(path, os.getcwd())
rel_file = os.path.join(rel_dir, filepath)
report_list.append(rel_file)