diff --git a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs index 5e6c443f0..b19bc60c7 100644 --- a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs +++ b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs @@ -153,8 +153,22 @@ public MSGFileProcessor(Stream sourceStream) } } var startAt = 0; - foreach (var inlineAttachment in inlineAttachments.OrderBy(m => m.GetType().GetProperty("RenderingPosition").GetValue(m, null))) + foreach (var inlineAttachment in inlineAttachments.OrderBy(m => { + int pos = (int) m.GetType().GetProperty("RenderingPosition").GetValue(m, null); + if (pos > -1) + { + return pos; + } + else + { + var _inlineAttachment = (Storage.Attachment)m; + Regex regex = new Regex(@""); + Match match = regex.Match(bodyreplaced, startAt); + return match.Index; + } + })) + { if (rtfInline) { if (!inlineAttachment.GetType().FullName.ToLower().Contains("message")) @@ -185,7 +199,7 @@ public MSGFileProcessor(Stream sourceStream) else if (htmlInline) { var _inlineAttachment = (Storage.Attachment)inlineAttachment; - Regex regex = new Regex(""); + Regex regex = new Regex(@""); Match match = regex.Match(bodyreplaced, startAt); if (match.Success) { @@ -218,7 +232,7 @@ public MSGFileProcessor(Stream sourceStream) heightString = " height =\"" + height + "\""; } string imgReplacementString = ""; - bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, 1, startAt); + bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, Int32.MaxValue, startAt); startAt = match.Index + imgReplacementString.Length; } foreach (KeyValuePair> attachment in attachmentsObj) diff --git a/api/reviewer_api/models/DeduplicationJob.py b/api/reviewer_api/models/DeduplicationJob.py index 81699e690..0258ed6c0 100644 --- a/api/reviewer_api/models/DeduplicationJob.py +++ b/api/reviewer_api/models/DeduplicationJob.py @@ -64,12 +64,12 @@ def getdedupestatus(cls, ministryrequestid): 
executions = [] try: sql = """select distinct on (deduplicationjobid) deduplicationjobid, version, - filename, status, documentmasterid, trigger + filename, status, documentmasterid, trigger, message from "DeduplicationJob" fcj where ministryrequestid = :ministryrequestid order by deduplicationjobid, "version" desc""" rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid}) for row in rs: - executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"]}) + executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"], "message": row["message"]}) except Exception as ex: logging.error(ex) db.session.close() diff --git a/api/reviewer_api/services/documentservice.py b/api/reviewer_api/services/documentservice.py index 33170c4ba..0f5ad7734 100644 --- a/api/reviewer_api/services/documentservice.py +++ b/api/reviewer_api/services/documentservice.py @@ -251,6 +251,7 @@ def __updatededupestatus(self, dedupes, record): record["deduplicationstatus"] = dedupe["status"] record["filename"] = dedupe["filename"] record["trigger"] = dedupe["trigger"] + record["message"] = dedupe["message"] return record def __updateproperties_old(self, properties, records, record): diff --git a/computingservices/DedupeServices/requirements.txt b/computingservices/DedupeServices/requirements.txt index ca7e1b33c..8128a986f 100644 Binary files a/computingservices/DedupeServices/requirements.txt and b/computingservices/DedupeServices/requirements.txt differ diff --git a/computingservices/DedupeServices/services/dedupeservice.py b/computingservices/DedupeServices/services/dedupeservice.py index 0799beea4..a4af991b6 100644 --- 
a/computingservices/DedupeServices/services/dedupeservice.py +++ b/computingservices/DedupeServices/services/dedupeservice.py @@ -21,4 +21,4 @@ def processmessage(message): documentspagecalculatorproducerservice().producepagecalculatorevent(pagecalculatormessage, _pagecount, pagecalculatorjobid) except(Exception) as error: print("Exception while processing redis message, func processmessage(p3), Error : {0} ".format(error)) - recordjobend(message, True, traceback.format_exc()) \ No newline at end of file + recordjobend(message, True, error.args[0]) \ No newline at end of file diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py index 6f4d46047..ff99843c9 100644 --- a/computingservices/DedupeServices/services/s3documentservice.py +++ b/computingservices/DedupeServices/services/s3documentservice.py @@ -13,6 +13,7 @@ import hashlib import uuid from re import sub +import fitz from utils import ( gets3credentialsobject, getdedupeproducermessage, @@ -49,6 +50,49 @@ def __getcredentialsbybcgovcode(bcgovcode): return s3cred +def _prepareattachment(producermessage, data, s3uripath, file_name): + attachment = { + "filename": escape(sub("<[0-9]+>", "", file_name, 1)), + "s3uripath": s3uripath, + "attributes": deepcopy(producermessage.attributes), + } + attachment["attributes"]["filesize"] = len(data) + attachment["attributes"][ + "parentpdfmasterid" + ] = producermessage.documentmasterid + attachment["attributes"].pop("batch") + attachment["attributes"].pop("extension") + attachment["attributes"].pop("incompatible") + return attachment + +def _generate_file_attachments(producermessage, reader, auth): + file_attachments = [] + for page in reader.pages: + if "/Annots" in page: + annotations = page["/Annots"] + for annotation in annotations: + subtype = annotation.get_object()["/Subtype"] + if subtype == "/FileAttachment": + # Placeholder logic to handle pdf attachments+embedds. 
Once resources are available to revise this feature and extract attachments and embedded files into one new parent PDF, this error handling will be removed. + raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced") + + # Old logic to extract embedded files. Uncomment when the new feature to save PDF embedded files and attachments as one file is started. + # producermessage.attributes["hasattachment"] = True + # fileobj = annotation.get_object()["/FS"] + # file = fileobj["/F"] + # data = fileobj["/EF"]["/F"].get_data() + # # data = BytesIO(data).getvalue() + # s3uripath = ( + # path.splitext(producermessage.s3filepath)[0] + # + "/" + # + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1]) + # ) + # uploadresponse = requests.put(s3uripath, data=data, auth=auth) + # uploadresponse.raise_for_status() + # attachment = _prepareattachment(producermessage, data, s3uripath, file) + # file_attachments.append(attachment) + return file_attachments + def gets3documenthashcode(producermessage): s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode) s3_access_key_id = s3credentials.s3accesskey @@ -85,7 +129,11 @@ def gets3documenthashcode(producermessage): if "/Collection" in reader.trailer["/Root"]: producermessage.attributes["isportfolio"] = True else: - producermessage.attributes["hasattachment"] = True + # Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise this feature and extract attachments and embedded files into one new parent PDF, this error handling will be removed. + raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced") + + # Old logic to extract attached files. Uncomment when the new feature to save PDF embedded files and attachments as one file is started. 
+ # producermessage.attributes["hasattachment"] = True for name in reader.attachments: s3uripath = ( path.splitext(filepath)[0] @@ -95,18 +143,7 @@ def gets3documenthashcode(producermessage): data = b"".join(reader.attachments[name]) uploadresponse = requests.put(s3uripath, data=data, auth=auth) uploadresponse.raise_for_status() - attachment = { - "filename": escape(sub("<[0-9]+>", "", name, 1)), - "s3uripath": s3uripath, - "attributes": deepcopy(producermessage.attributes), - } - attachment["attributes"]["filesize"] = len(data) - attachment["attributes"][ - "parentpdfmasterid" - ] = producermessage.documentmasterid - attachment["attributes"].pop("batch") - attachment["attributes"].pop("extension") - attachment["attributes"].pop("incompatible") + attachment = _prepareattachment(producermessage, data, s3uripath, name) attachments.append(attachment) saveresponse = requests.post( request_management_api @@ -119,6 +156,26 @@ def gets3documenthashcode(producermessage): }, ) saveresponse.raise_for_status() + + # New logic to extract embedded file attachments (classified under annotations in the PDF) from pages in PDF + # Before looping of pdf pages started; confirm if annotations exist in the pdf using pyMuPdf library (fitz) + fitz_reader = fitz.open(stream=BytesIO(response.content), filetype="pdf") + if (fitz_reader.has_annots()): + file_attachments = _generate_file_attachments(producermessage, reader, auth) + if (len(file_attachments) > 0): + saveresponse = requests.post( + request_management_api + + "/api/foirecord/-1/ministryrequest/" + + producermessage.ministryrequestid, + data=json.dumps({"records": file_attachments}), + headers={ + "Authorization": producermessage.usertoken, + "Content-Type": "application/json", + } + ) + saveresponse.raise_for_status() + fitz_reader.close() + elif extension.lower() in file_conversion_types: # "Extension different {0}, so need to download pdf here for pagecount!!".format(extension)) pdfresponseofconverted = requests.get(