From 0a564b0785125990400718c5db9696af2f5a0c16 Mon Sep 17 00:00:00 2001
From: Aman-Hundal
Date: Fri, 5 Jul 2024 14:27:58 -0700
Subject: [PATCH 1/3] Added logic to extract/burst out embedded file
 attachments

---
 .../DedupeServices/requirements.txt           | Bin 794 -> 828 bytes
 .../services/s3documentservice.py             |  73 +++++++++++++++---
 2 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/computingservices/DedupeServices/requirements.txt b/computingservices/DedupeServices/requirements.txt
index ca7e1b33cdcfb97590fa2882d3308273c2446589..8128a986fc4750d4708c541c96d43ee15247adc2 100644
GIT binary patch
delta 41
ucmbQmwuf!QH73~rhDrushEj$A1{Vf523sIBWYA+UVlW1h7LzYBc>(~<#0bCu

delta 11
ScmdnPHj8b;HKxfwm|OrIPXxRG

diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py
index 6f4d46047..7b1d06f09 100644
--- a/computingservices/DedupeServices/services/s3documentservice.py
+++ b/computingservices/DedupeServices/services/s3documentservice.py
@@ -13,6 +13,7 @@
 import hashlib
 import uuid
 from re import sub
+import fitz
 from utils import (
     gets3credentialsobject,
     getdedupeproducermessage,
@@ -49,6 +50,45 @@ def __getcredentialsbybcgovcode(bcgovcode):
 
     return s3cred
 
+def _prepareattachment(producermessage, data, s3uripath, file_name):
+    attachment = {
+        "filename": escape(sub("<[0-9]+>", "", file_name, 1)),
+        "s3uripath": s3uripath,
+        "attributes": deepcopy(producermessage.attributes),
+    }
+    attachment["attributes"]["filesize"] = len(data)
+    attachment["attributes"][
+        "parentpdfmasterid"
+    ] = producermessage.documentmasterid
+    attachment["attributes"].pop("batch")
+    attachment["attributes"].pop("extension")
+    attachment["attributes"].pop("incompatible")
+    return attachment
+
+def _generate_file_attachments(producermessage, reader, auth):
+    file_attachments = []
+    for page in reader.pages:
+        if "/Annots" in page:
+            annotations = page["/Annots"]
+            for annotation in annotations:
+                subtype = annotation.get_object()["/Subtype"]
+                if subtype == "/FileAttachment":
+                    producermessage.attributes["hasattachment"] = True
+                    fileobj = annotation.get_object()["/FS"]
+                    file = fileobj["/F"]
+                    data = fileobj["/EF"]["/F"].get_data()
+                    # data = BytesIO(data).getvalue()
+                    s3uripath = (
+                        path.splitext(producermessage.s3filepath)[0]
+                        + "/"
+                        + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
+                    )
+                    uploadresponse = requests.put(s3uripath, data=data, auth=auth)
+                    uploadresponse.raise_for_status()
+                    attachment = _prepareattachment(producermessage, data, s3uripath, file)
+                    file_attachments.append(attachment)
+    return file_attachments
+
 def gets3documenthashcode(producermessage):
     s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
     s3_access_key_id = s3credentials.s3accesskey
@@ -95,18 +135,7 @@ def gets3documenthashcode(producermessage):
                 data = b"".join(reader.attachments[name])
                 uploadresponse = requests.put(s3uripath, data=data, auth=auth)
                 uploadresponse.raise_for_status()
-                attachment = {
-                    "filename": escape(sub("<[0-9]+>", "", name, 1)),
-                    "s3uripath": s3uripath,
-                    "attributes": deepcopy(producermessage.attributes),
-                }
-                attachment["attributes"]["filesize"] = len(data)
-                attachment["attributes"][
-                    "parentpdfmasterid"
-                ] = producermessage.documentmasterid
-                attachment["attributes"].pop("batch")
-                attachment["attributes"].pop("extension")
-                attachment["attributes"].pop("incompatible")
+                attachment = _prepareattachment(producermessage, data, s3uripath, name)
                 attachments.append(attachment)
             saveresponse = requests.post(
                 request_management_api
@@ -119,6 +148,26 @@ def gets3documenthashcode(producermessage):
                 },
             )
             saveresponse.raise_for_status()
+
+            # New logic to extract embedded file attachments (classified under annotations in the PDF) from the PDF's pages.
+            # Before looping over the PDF pages, confirm that annotations exist in the PDF using the PyMuPDF library (fitz).
+            fitz_reader = fitz.open(stream=BytesIO(response.content), filetype="pdf")
+            if (fitz_reader.has_annots()):
+                file_attachments = _generate_file_attachments(producermessage, reader, auth)
+                if (len(file_attachments) > 0):
+                    saveresponse = requests.post(
+                        request_management_api
+                        + "/api/foirecord/-1/ministryrequest/"
+                        + producermessage.ministryrequestid,
+                        data=json.dumps({"records": file_attachments}),
+                        headers={
+                            "Authorization": producermessage.usertoken,
+                            "Content-Type": "application/json",
+                        }
+                    )
+                    saveresponse.raise_for_status()
+            fitz_reader.close()
+
         elif extension.lower() in file_conversion_types:
             # "Extension different {0}, so need to download pdf here for pagecount!!".format(extension))
             pdfresponseofconverted = requests.get(
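
For reference, a minimal standalone sketch of the detection approach PATCH 1/3 takes: pre-check the document for annotations with PyMuPDF (fitz), then walk the pypdf page annotations looking for /FileAttachment entries. This is an illustration only, not part of the patch; it assumes the pypdf reader API used by the service, and the function name and sample path are placeholders.

    # Illustrative sketch only (not part of the patch). Mirrors the detection in
    # _generate_file_attachments without the S3 upload and record-service calls.
    # Assumes pypdf and PyMuPDF are installed; "sample.pdf" is a placeholder path.
    from io import BytesIO

    import fitz  # PyMuPDF
    from pypdf import PdfReader

    def list_file_attachment_names(pdf_bytes):
        names = []
        # Cheap pre-check: skip the page walk when the document has no annotations at all.
        with fitz.open(stream=BytesIO(pdf_bytes), filetype="pdf") as doc:
            if not doc.has_annots():
                return names
        reader = PdfReader(BytesIO(pdf_bytes))
        for page in reader.pages:
            for annotation in page.get("/Annots", []):
                obj = annotation.get_object()
                if obj.get("/Subtype") == "/FileAttachment":
                    filespec = obj["/FS"]
                    names.append(str(filespec["/F"]))
                    # filespec["/EF"]["/F"].get_data() would return the embedded bytes.
        return names

    if __name__ == "__main__":
        with open("sample.pdf", "rb") as f:  # placeholder file name
            print(list_file_attachment_names(f.read()))
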
From 3eb45681a80ed0fc75c23ca959e1c3a24837d629 Mon Sep 17 00:00:00 2001
From: unknown
Date: Tue, 9 Jul 2024 09:37:34 -0700
Subject: [PATCH 2/3] fix image issues with msg conversion

---
 .../MCS.FOI.MSGToPDF/MSGFileProcessor.cs      | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
index 5e6c443f0..b19bc60c7 100644
--- a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
+++ b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
@@ -153,8 +153,22 @@ public MSGFileProcessor(Stream sourceStream)
                             }
                         }
                         var startAt = 0;
-                        foreach (var inlineAttachment in inlineAttachments.OrderBy(m => m.GetType().GetProperty("RenderingPosition").GetValue(m, null)))
+                        foreach (var inlineAttachment in inlineAttachments.OrderBy(m => {
+                            int pos = (int) m.GetType().GetProperty("RenderingPosition").GetValue(m, null);
+                            if (pos > -1)
+                            {
+                                return pos;
+                            }
+                            else
+                            {
+                                var _inlineAttachment = (Storage.Attachment)m;
+                                Regex regex = new Regex(@"");
+                                Match match = regex.Match(bodyreplaced, startAt);
+                                return match.Index;
+                            }
+                        }))
+
                         {
                             if (rtfInline)
                             {
                                 if (!inlineAttachment.GetType().FullName.ToLower().Contains("message"))
                                 {
@@ -185,7 +199,7 @@ public MSGFileProcessor(Stream sourceStream)
                             else if (htmlInline)
                             {
                                 var _inlineAttachment = (Storage.Attachment)inlineAttachment;
-                                Regex regex = new Regex("");
+                                Regex regex = new Regex(@"");
                                 Match match = regex.Match(bodyreplaced, startAt);
                                 if (match.Success)
                                 {
@@ -218,7 +232,7 @@ public MSGFileProcessor(Stream sourceStream)
                                         heightString = " height =\"" + height + "\"";
                                     }
                                     string imgReplacementString = "";
-                                    bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, 1, startAt);
+                                    bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, Int32.MaxValue, startAt);
                                     startAt = match.Index + imgReplacementString.Length;
                                 }
                             foreach (KeyValuePair> attachment in attachmentsObj)
From a8b9737c0a3be2df6ef1658963ad0276beedf8a2 Mon Sep 17 00:00:00 2001
From: Aman-Hundal
Date: Tue, 16 Jul 2024 12:19:13 -0700
Subject: [PATCH 3/3] Added logic to generate an error during dedupe process
 when an attachment or embedded file is found in the PDF. This error is
 recorded in the dedupejob message and used in foi-flow to convey the error
 message to the user.

---
 api/reviewer_api/models/DeduplicationJob.py   |  4 +-
 api/reviewer_api/services/documentservice.py  |  1 +
 .../DedupeServices/services/dedupeservice.py  |  2 +-
 .../services/s3documentservice.py             | 40 +++++++++++--------
 4 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/api/reviewer_api/models/DeduplicationJob.py b/api/reviewer_api/models/DeduplicationJob.py
index 81699e690..0258ed6c0 100644
--- a/api/reviewer_api/models/DeduplicationJob.py
+++ b/api/reviewer_api/models/DeduplicationJob.py
@@ -64,12 +64,12 @@ def getdedupestatus(cls, ministryrequestid):
         executions = []
         try:
             sql = """select distinct on (deduplicationjobid) deduplicationjobid, version,
-                filename, status, documentmasterid, trigger
+                filename, status, documentmasterid, trigger, message
                 from "DeduplicationJob" fcj where ministryrequestid = :ministryrequestid
                 order by deduplicationjobid, "version" desc"""
             rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
             for row in rs:
-                executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"]})
+                executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"], "message": row["message"]})
         except Exception as ex:
             logging.error(ex)
         db.session.close()
diff --git a/api/reviewer_api/services/documentservice.py b/api/reviewer_api/services/documentservice.py
index a4b23b6f4..301c09c1c 100644
--- a/api/reviewer_api/services/documentservice.py
+++ b/api/reviewer_api/services/documentservice.py
@@ -251,6 +251,7 @@ def __updatededupestatus(self, dedupes, record):
                 record["deduplicationstatus"] = dedupe["status"]
                 record["filename"] = dedupe["filename"]
                 record["trigger"] = dedupe["trigger"]
+                record["message"] = dedupe["message"]
         return record
 
     def __updateproperties_old(self, properties, records, record):
diff --git a/computingservices/DedupeServices/services/dedupeservice.py b/computingservices/DedupeServices/services/dedupeservice.py
index 0799beea4..a4af991b6 100644
--- a/computingservices/DedupeServices/services/dedupeservice.py
+++ b/computingservices/DedupeServices/services/dedupeservice.py
@@ -21,4 +21,4 @@ def processmessage(message):
             documentspagecalculatorproducerservice().producepagecalculatorevent(pagecalculatormessage, _pagecount, pagecalculatorjobid)
     except(Exception) as error:
         print("Exception while processing redis message, func processmessage(p3), Error : {0} ".format(error))
-        recordjobend(message, True, traceback.format_exc())
\ No newline at end of file
+        recordjobend(message, True, error.args[0])
\ No newline at end of file
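
A minimal sketch of how the error text recorded above is expected to surface, assuming recordjobend persists its third argument into the DeduplicationJob "message" column that getdedupestatus now selects. record_job_end and fetch_status_row below are illustrative stand-ins, not the service's real helpers.

    # Illustrative sketch only: the intended path of the error text from the dedupe
    # worker to the record shown in foi-flow. job_table stands in for the
    # "DeduplicationJob" table; the raised message matches the one in this patch.
    job_table = []

    def record_job_end(message, iserror, errortext):
        # Mirrors recordjobend(message, True, error.args[0]): the raised text is persisted.
        job_table.append({
            "filename": message["filename"],
            "status": "error" if iserror else "completed",
            "message": errortext,
        })

    def fetch_status_row(filename):
        # Mirrors the updated query: status rows now carry "message" as well.
        return next(row for row in job_table if row["filename"] == filename)

    try:
        raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
    except Exception as error:
        record_job_end({"filename": "example.pdf"}, True, error.args[0])

    print(fetch_status_row("example.pdf")["message"])  # the text foi-flow displays to the user
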
diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py
index 7b1d06f09..ff99843c9 100644
--- a/computingservices/DedupeServices/services/s3documentservice.py
+++ b/computingservices/DedupeServices/services/s3documentservice.py
@@ -72,21 +72,25 @@ def _generate_file_attachments(producermessage, reader, auth):
             annotations = page["/Annots"]
             for annotation in annotations:
                 subtype = annotation.get_object()["/Subtype"]
-                if subtype == "/FileAttachment":
-                    producermessage.attributes["hasattachment"] = True
-                    fileobj = annotation.get_object()["/FS"]
-                    file = fileobj["/F"]
-                    data = fileobj["/EF"]["/F"].get_data()
-                    # data = BytesIO(data).getvalue()
-                    s3uripath = (
-                        path.splitext(producermessage.s3filepath)[0]
-                        + "/"
-                        + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
-                    )
-                    uploadresponse = requests.put(s3uripath, data=data, auth=auth)
-                    uploadresponse.raise_for_status()
-                    attachment = _prepareattachment(producermessage, data, s3uripath, file)
-                    file_attachments.append(attachment)
+                if subtype == "/FileAttachment":
+                    # Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise this feature and extract attachments/embedded files into one new parent PDF, this error handling will be removed.
+                    raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
+
+                    # Old logic to extract embedded files. Uncomment when the new feature to save PDF embedded files and attachments as one file is started.
+                    # producermessage.attributes["hasattachment"] = True
+                    # fileobj = annotation.get_object()["/FS"]
+                    # file = fileobj["/F"]
+                    # data = fileobj["/EF"]["/F"].get_data()
+                    # # data = BytesIO(data).getvalue()
+                    # s3uripath = (
+                    #     path.splitext(producermessage.s3filepath)[0]
+                    #     + "/"
+                    #     + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
+                    # )
+                    # uploadresponse = requests.put(s3uripath, data=data, auth=auth)
+                    # uploadresponse.raise_for_status()
+                    # attachment = _prepareattachment(producermessage, data, s3uripath, file)
+                    # file_attachments.append(attachment)
     return file_attachments
 
 def gets3documenthashcode(producermessage):
@@ -125,7 +129,11 @@ def gets3documenthashcode(producermessage):
             if "/Collection" in reader.trailer["/Root"]:
                 producermessage.attributes["isportfolio"] = True
             else:
-                producermessage.attributes["hasattachment"] = True
+                # Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise this feature and extract attachments/embedded files into one new parent PDF, this error handling will be removed.
+                raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
+
+                # Old logic to extract attached files. Uncomment when the new feature to save PDF embedded files and attachments as one file is started.
+                # producermessage.attributes["hasattachment"] = True
             for name in reader.attachments:
                 s3uripath = (
                     path.splitext(filepath)[0]
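
To summarize the guard this patch introduces, a hedged sketch of the two rejected conditions expressed as a single helper: a non-portfolio document exposing name-tree attachments, and any document carrying /FileAttachment annotations. The helper name is illustrative; the service itself raises inline rather than through a helper, and portfolio PDFs (a /Collection entry in the catalog) continue to be processed.

    # Illustrative sketch only: the conditions rejected by the placeholder logic above,
    # assuming the pypdf reader API used by the service.
    ATTACHMENT_ERROR = "PDF contains attachments and/or embedded files. File must be manually fixed and replaced"

    def ensure_no_embedded_files(reader):
        is_portfolio = "/Collection" in reader.trailer["/Root"]
        if reader.attachments and not is_portfolio:
            # Second hunk: attachments outside a portfolio are rejected.
            raise Exception(ATTACHMENT_ERROR)
        for page in reader.pages:
            for annotation in page.get("/Annots", []):
                if annotation.get_object().get("/Subtype") == "/FileAttachment":
                    # First hunk: file-attachment annotations are rejected.
                    raise Exception(ATTACHMENT_ERROR)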