From 51b1029ae0292d1030ab2064a3bc5395f23565d3 Mon Sep 17 00:00:00 2001 From: Aman-Hundal Date: Fri, 3 May 2024 11:02:00 -0700 Subject: [PATCH] Added logic to extract out file attachments / embedded files from pdf pages during the dedupe computing service --- .../DedupeServices/requirements.txt | Bin 794 -> 828 bytes .../services/s3documentservice.py | 73 +++++++++++++++--- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/computingservices/DedupeServices/requirements.txt b/computingservices/DedupeServices/requirements.txt index ca7e1b33cdcfb97590fa2882d3308273c2446589..8401abd88215e6a22eb1d28edb0e8204dbdc711a 100644 GIT binary patch delta 42 tcmbQmwufzl6tfa90~bR8LnVVRLn)AUVQ^!x1wumxJq9BNV<2h4005{)266xZ delta 7 OcmdnPHj8b86f*z{umUgu diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py index 6f4d46047..7b1d06f09 100644 --- a/computingservices/DedupeServices/services/s3documentservice.py +++ b/computingservices/DedupeServices/services/s3documentservice.py @@ -13,6 +13,7 @@ import hashlib import uuid from re import sub +import fitz from utils import ( gets3credentialsobject, getdedupeproducermessage, @@ -49,6 +50,45 @@ def __getcredentialsbybcgovcode(bcgovcode): return s3cred +def _prepareattachment(producermessage, data, s3uripath, file_name): + attachment = { + "filename": escape(sub("<[0-9]+>", "", file_name, 1)), + "s3uripath": s3uripath, + "attributes": deepcopy(producermessage.attributes), + } + attachment["attributes"]["filesize"] = len(data) + attachment["attributes"][ + "parentpdfmasterid" + ] = producermessage.documentmasterid + attachment["attributes"].pop("batch") + attachment["attributes"].pop("extension") + attachment["attributes"].pop("incompatible") + return attachment + +def _generate_file_attachments(producermessage, reader, auth): + file_attachments = [] + for page in reader.pages: + if "/Annots" in page: + annotations = page["/Annots"] + for annotation in annotations: + subtype = annotation.get_object()["/Subtype"] + if subtype == "/FileAttachment": + producermessage.attributes["hasattachment"] = True + fileobj = annotation.get_object()["/FS"] + file = fileobj["/F"] + data = fileobj["/EF"]["/F"].get_data() + # data = BytesIO(data).getvalue() + s3uripath = ( + path.splitext(producermessage.s3filepath)[0] + + "/" + + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1]) + ) + uploadresponse = requests.put(s3uripath, data=data, auth=auth) + uploadresponse.raise_for_status() + attachment = _prepareattachment(producermessage, data, s3uripath, file) + file_attachments.append(attachment) + return file_attachments + def gets3documenthashcode(producermessage): s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode) s3_access_key_id = s3credentials.s3accesskey @@ -95,18 +135,7 @@ def gets3documenthashcode(producermessage): data = b"".join(reader.attachments[name]) uploadresponse = requests.put(s3uripath, data=data, auth=auth) uploadresponse.raise_for_status() - attachment = { - "filename": escape(sub("<[0-9]+>", "", name, 1)), - "s3uripath": s3uripath, - "attributes": deepcopy(producermessage.attributes), - } - attachment["attributes"]["filesize"] = len(data) - attachment["attributes"][ - "parentpdfmasterid" - ] = producermessage.documentmasterid - attachment["attributes"].pop("batch") - attachment["attributes"].pop("extension") - attachment["attributes"].pop("incompatible") + attachment = _prepareattachment(producermessage, data, s3uripath, name) attachments.append(attachment) saveresponse = requests.post( request_management_api @@ -119,6 +148,26 @@ def gets3documenthashcode(producermessage): }, ) saveresponse.raise_for_status() + + # New logic to extract embedded file attachments (classified under annotations in the PDF) from pages in PDF + # Before looping of pdf pages started; confirm if annotations exist in the pdf using pyMuPdf library (fitz) + fitz_reader = fitz.open(stream=BytesIO(response.content), filetype="pdf") + if (fitz_reader.has_annots()): + file_attachments = _generate_file_attachments(producermessage, reader, auth) + if (len(file_attachments) > 0): + saveresponse = requests.post( + request_management_api + + "/api/foirecord/-1/ministryrequest/" + + producermessage.ministryrequestid, + data=json.dumps({"records": file_attachments}), + headers={ + "Authorization": producermessage.usertoken, + "Content-Type": "application/json", + } + ) + saveresponse.raise_for_status() + fitz_reader.close() + elif extension.lower() in file_conversion_types: # "Extension different {0}, so need to download pdf here for pagecount!!".format(extension)) pdfresponseofconverted = requests.get(