Merge pull request #1067 from bcgov/dev-rook-RQ-FOIMOD-2975-bugfix

Sync test and test-rook
bcgov · Jul 25, 2024 · 235ab5e · 235ab5e
2 parents c2ff9d9 + 8f50758
commit 235ab5e
Show file tree

Hide file tree

Showing 6 changed files with 91 additions and 19 deletions.
diff --git a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
@@ -153,8 +153,22 @@ public MSGFileProcessor(Stream sourceStream)
                                     }
                                 }
                                 var startAt = 0;
-                                foreach (var inlineAttachment in inlineAttachments.OrderBy(m => m.GetType().GetProperty("RenderingPosition").GetValue(m, null)))
+                                foreach (var inlineAttachment in inlineAttachments.OrderBy(m =>
                                 {
+                                    // Sort key: RenderingPosition when it is set (> -1); otherwise the offset of the attachment's cid: <img> tag in the body.
+                                    int pos = (int) m.GetType().GetProperty("RenderingPosition").GetValue(m, null);
+                                    if (pos > -1)
+                                    {
+                                        return pos;
+                                    }
+                                    var _inlineAttachment = (Storage.Attachment)m;
+                                    // Non-verbatim literal so "\\n" reaches the regex as \n (newline); Regex.Escape keeps ContentId metacharacters literal.
+                                    Regex regex = new Regex("<img(.|\\n)*cid:" + Regex.Escape(_inlineAttachment.ContentId ?? string.Empty) + "(.|\\n)*?>");
+                                    Match match = regex.Match(bodyreplaced, startAt);
+                                    // Match.Index is 0 when nothing matched, so attachments without a cid: reference sort to the front.
+                                    return match.Index;
+                                }))
+                                {
                                     if (rtfInline)
                                     {
                                         if (!inlineAttachment.GetType().FullName.ToLower().Contains("message"))
@@ -185,7 +199,7 @@ public MSGFileProcessor(Stream sourceStream)
                                     else if (htmlInline)
                                     {
                                         var _inlineAttachment = (Storage.Attachment)inlineAttachment;
-                                        Regex regex = new Regex("<img(.|\\n)*cid:" + _inlineAttachment.ContentId + "(.|\\n)*?>");
+                                        Regex regex = new Regex("<img(.|\\n)*cid:" + Regex.Escape(_inlineAttachment.ContentId ?? string.Empty) + "(.|\\n)*?>"); // non-verbatim literal: "\\n" must reach the regex as \n (newline); ContentId is escaped so it matches literally
                                         Match match = regex.Match(bodyreplaced, startAt);
                                         if (match.Success)
                                         {
@@ -218,7 +232,7 @@ public MSGFileProcessor(Stream sourceStream)
                                                 heightString = " height =\"" + height + "\"";
                                             }
                                             string imgReplacementString = "<img "+ widthString + heightString + " style =\"margin: 1px;\" src=\"data:" + _inlineAttachment.MimeType + ";base64," + Convert.ToBase64String(_inlineAttachment.Data) + "\"/>";
-                                            bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, 1, startAt);
+                                            bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, Int32.MaxValue, startAt); // count = Int32.MaxValue: replace every match found at or after startAt, not only the first
                                             startAt = match.Index + imgReplacementString.Length;
                                         }
                                         foreach (KeyValuePair<MemoryStream, Dictionary<string, string>> attachment in attachmentsObj)

diff --git a/api/reviewer_api/models/DeduplicationJob.py b/api/reviewer_api/models/DeduplicationJob.py
@@ -64,12 +64,12 @@ def getdedupestatus(cls, ministryrequestid):
         executions = []
         try:
             sql = """select distinct on (deduplicationjobid) deduplicationjobid, version, 
-                    filename, status, documentmasterid, trigger   
+                    filename, status, documentmasterid, trigger, message   
                     from "DeduplicationJob" fcj  where ministryrequestid = :ministryrequestid
                     order by deduplicationjobid, "version" desc"""
             rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
             for row in rs:
-                executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"]})
+                executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"], "message": row["message"]})
         except Exception as ex:
             logging.error(ex)
             db.session.close()

diff --git a/api/reviewer_api/services/documentservice.py b/api/reviewer_api/services/documentservice.py
@@ -251,6 +251,7 @@ def __updatededupestatus(self, dedupes, record):
                 record["deduplicationstatus"] = dedupe["status"]
                 record["filename"] = dedupe["filename"]
                 record["trigger"] = dedupe["trigger"]
+                record["message"] = dedupe["message"]
         return record
 
     def __updateproperties_old(self, properties, records, record):

diff --git a/computingservices/DedupeServices/requirements.txt b/computingservices/DedupeServices/requirements.txt
diff --git a/computingservices/DedupeServices/services/dedupeservice.py b/computingservices/DedupeServices/services/dedupeservice.py
@@ -21,4 +21,4 @@ def processmessage(message):
             documentspagecalculatorproducerservice().producepagecalculatorevent(pagecalculatormessage, _pagecount, pagecalculatorjobid)
     except(Exception) as error:
         print("Exception while processing redis message, func processmessage(p3), Error : {0} ".format(error))
-        recordjobend(message, True, traceback.format_exc())
+        recordjobend(message, True, str(error.args[0]) if error.args else repr(error))  # args may be empty (e.g. raise Exception()); indexing it would raise IndexError inside this handler and skip recordjobend
diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py
@@ -13,6 +13,7 @@
 import hashlib
 import uuid
 from re import sub
+import fitz
 from utils import (
     gets3credentialsobject,
     getdedupeproducermessage,
@@ -49,6 +50,49 @@ def __getcredentialsbybcgovcode(bcgovcode):
 
     return s3cred
 
+def _prepareattachment(producermessage, data, s3uripath, file_name):
+    """Return the record dict (filename, s3uripath, attributes) for one file extracted from a parent PDF."""
+    # Remove the first "<digits>" marker from the name, then pass it through escape().
+    safe_name = escape(sub("<[0-9]+>", "", file_name, 1))
+    # Deep copy so the edits below never touch producermessage.attributes.
+    attrs = deepcopy(producermessage.attributes)
+    # Record the payload size and point the attachment back at its parent document.
+    attrs["filesize"] = len(data)
+    attrs["parentpdfmasterid"] = producermessage.documentmasterid
+    # These keys are dropped from the copy; pop() raises KeyError if one is absent.
+    for key in ("batch", "extension", "incompatible"):
+        attrs.pop(key)
+    record = {"filename": safe_name, "s3uripath": s3uripath, "attributes": attrs}
+    return record
+
+def _generate_file_attachments(producermessage, reader, auth):
+    """Scan each page's /Annots for /FileAttachment annotations; currently raises on the first one found, otherwise returns []."""
+    file_attachments = []
+    for page in reader.pages:
+        if "/Annots" in page:
+            for annotation in page["/Annots"]:
+                annot = annotation.get_object()
+                # Check the key first so an annotation without /Subtype is skipped instead of raising a bare KeyError.
+                if "/Subtype" in annot and annot["/Subtype"] == "/FileAttachment":
+                    # Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise the feature and extract attachments and embedded files into one new parent PDF, this error handling will be removed.
+                    raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
+                    # Old logic to extract embedded files (the only user of auth). Uncomment when the new feature to save PDF embedded files and attachments as one file is started.
+                    # producermessage.attributes["hasattachment"] = True
+                    # fileobj = annotation.get_object()["/FS"]
+                    # file = fileobj["/F"]
+                    # data = fileobj["/EF"]["/F"].get_data()
+                    # # data = BytesIO(data).getvalue()
+                    # s3uripath = (
+                    #     path.splitext(producermessage.s3filepath)[0]
+                    #     + "/"
+                    #     + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
+                    # )
+                    # uploadresponse = requests.put(s3uripath, data=data, auth=auth)
+                    # uploadresponse.raise_for_status()
+                    # attachment = _prepareattachment(producermessage, data, s3uripath, file)
+                    # file_attachments.append(attachment)
+    return file_attachments
+
 def gets3documenthashcode(producermessage):
     s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)    
     s3_access_key_id = s3credentials.s3accesskey
@@ -85,7 +129,11 @@ def gets3documenthashcode(producermessage):
             if "/Collection" in reader.trailer["/Root"]:
                 producermessage.attributes["isportfolio"] = True
             else:
-                producermessage.attributes["hasattachment"] = True
+                # Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise the feature and extract attachments and embedded files into one new parent PDF, this error handling will be removed.
+                raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")

+                # Old logic to extract attached files. Uncomment when the new feature to save PDF embedded files and attachments as one file is started.
+                # producermessage.attributes["hasattachment"] = True
             for name in reader.attachments:
                 s3uripath = (
                     path.splitext(filepath)[0]
@@ -95,18 +143,7 @@ def gets3documenthashcode(producermessage):
                 data = b"".join(reader.attachments[name])
                 uploadresponse = requests.put(s3uripath, data=data, auth=auth)
                 uploadresponse.raise_for_status()
-                attachment = {
-                    "filename": escape(sub("<[0-9]+>", "", name, 1)),
-                    "s3uripath": s3uripath,
-                    "attributes": deepcopy(producermessage.attributes),
-                }
-                attachment["attributes"]["filesize"] = len(data)
-                attachment["attributes"][
-                    "parentpdfmasterid"
-                ] = producermessage.documentmasterid
-                attachment["attributes"].pop("batch")
-                attachment["attributes"].pop("extension")
-                attachment["attributes"].pop("incompatible")
+                attachment = _prepareattachment(producermessage, data, s3uripath, name)
                 attachments.append(attachment)
             saveresponse = requests.post(
                 request_management_api
@@ -119,6 +156,26 @@ def gets3documenthashcode(producermessage):
                 },
             )
             saveresponse.raise_for_status()
+
+        # New logic to extract embedded file attachments (classified under annotations in the PDF) from pages in PDF
+        # Before looping over the PDF pages, confirm that annotations exist in the PDF using the PyMuPDF library (fitz)
+        fitz_reader = fitz.open(stream=BytesIO(response.content), filetype="pdf")
+        try:
+            # try/finally: close() must run even when _generate_file_attachments raises, which it does for any /FileAttachment annotation
+            if fitz_reader.has_annots():
+                file_attachments = _generate_file_attachments(producermessage, reader, auth)
+                if len(file_attachments) > 0:
+                    saveresponse = requests.post(
+                        request_management_api
+                        + "/api/foirecord/-1/ministryrequest/"
+                        + producermessage.ministryrequestid,
+                        data=json.dumps({"records": file_attachments}),
+                        headers={"Authorization": producermessage.usertoken, "Content-Type": "application/json"},
+                    )
+                    saveresponse.raise_for_status()
+        finally:
+            fitz_reader.close()
+
     elif extension.lower() in file_conversion_types:
         # "Extension different {0}, so need to download pdf here for pagecount!!".format(extension))
         pdfresponseofconverted = requests.get(