From 0a564b0785125990400718c5db9696af2f5a0c16 Mon Sep 17 00:00:00 2001
From: Aman-Hundal
Date: Fri, 5 Jul 2024 14:27:58 -0700
Subject: [PATCH 1/3] Added logic to extract/burst out embedded file
 attachments

---
 .../DedupeServices/requirements.txt           | Bin 794 -> 828 bytes
 .../services/s3documentservice.py             |  73 +++++++++++++++---
 2 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/computingservices/DedupeServices/requirements.txt b/computingservices/DedupeServices/requirements.txt
index ca7e1b33cdcfb97590fa2882d3308273c2446589..8128a986fc4750d4708c541c96d43ee15247adc2 100644
GIT binary patch
delta 41
ucmbQmwuf!QH73~rhDrushEj$A1{Vf523sIBWYA+UVlW1h7LzYBc>(~<#0bCu

delta 11
ScmdnPHj8b;HKxfwm|OrIPXxRG

diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py
index 6f4d46047..7b1d06f09 100644
--- a/computingservices/DedupeServices/services/s3documentservice.py
+++ b/computingservices/DedupeServices/services/s3documentservice.py
@@ -13,6 +13,7 @@
 import hashlib
 import uuid
 from re import sub
+import fitz
 from utils import (
     gets3credentialsobject,
     getdedupeproducermessage,
@@ -49,6 +50,45 @@ def __getcredentialsbybcgovcode(bcgovcode):
 
     return s3cred
 
+def _prepareattachment(producermessage, data, s3uripath, file_name):
+    attachment = {
+        "filename": escape(sub("<[0-9]+>", "", file_name, 1)),
+        "s3uripath": s3uripath,
+        "attributes": deepcopy(producermessage.attributes),
+    }
+    attachment["attributes"]["filesize"] = len(data)
+    attachment["attributes"][
+        "parentpdfmasterid"
+    ] = producermessage.documentmasterid
+    attachment["attributes"].pop("batch")
+    attachment["attributes"].pop("extension")
+    attachment["attributes"].pop("incompatible")
+    return attachment
+
+def _generate_file_attachments(producermessage, reader, auth):
+    file_attachments = []
+    for page in reader.pages:
+        if "/Annots" in page:
+            annotations = page["/Annots"]
+            for annotation in annotations:
+                subtype = annotation.get_object()["/Subtype"]
+                if subtype == "/FileAttachment":
+                    producermessage.attributes["hasattachment"] = True
+                    fileobj = annotation.get_object()["/FS"]
+                    file = fileobj["/F"]
+                    data = fileobj["/EF"]["/F"].get_data()
+                    # data = BytesIO(data).getvalue()
+                    s3uripath = (
+                        path.splitext(producermessage.s3filepath)[0]
+                        + "/"
+                        + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
+                    )
+                    uploadresponse = requests.put(s3uripath, data=data, auth=auth)
+                    uploadresponse.raise_for_status()
+                    attachment = _prepareattachment(producermessage, data, s3uripath, file)
+                    file_attachments.append(attachment)
+    return file_attachments
+
 def gets3documenthashcode(producermessage):
     s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
     s3_access_key_id = s3credentials.s3accesskey
@@ -95,18 +135,7 @@ def gets3documenthashcode(producermessage):
                 data = b"".join(reader.attachments[name])
                 uploadresponse = requests.put(s3uripath, data=data, auth=auth)
                 uploadresponse.raise_for_status()
-                attachment = {
-                    "filename": escape(sub("<[0-9]+>", "", name, 1)),
-                    "s3uripath": s3uripath,
-                    "attributes": deepcopy(producermessage.attributes),
-                }
-                attachment["attributes"]["filesize"] = len(data)
-                attachment["attributes"][
-                    "parentpdfmasterid"
-                ] = producermessage.documentmasterid
-                attachment["attributes"].pop("batch")
-                attachment["attributes"].pop("extension")
-                attachment["attributes"].pop("incompatible")
+                attachment = _prepareattachment(producermessage, data, s3uripath, name)
                 attachments.append(attachment)
             saveresponse = requests.post(
                 request_management_api
@@ -119,6 +148,26 @@ def gets3documenthashcode(producermessage):
                 },
             )
             saveresponse.raise_for_status()
+
+            # New logic to extract embedded file attachments (classified under annotations in the PDF) from the PDF's pages.
+            # Before looping over the PDF pages, confirm that annotations exist in the PDF using the PyMuPDF library (fitz).
+            fitz_reader = fitz.open(stream=BytesIO(response.content), filetype="pdf")
+            if (fitz_reader.has_annots()):
+                file_attachments = _generate_file_attachments(producermessage, reader, auth)
+                if (len(file_attachments) > 0):
+                    saveresponse = requests.post(
+                        request_management_api
+                        + "/api/foirecord/-1/ministryrequest/"
+                        + producermessage.ministryrequestid,
+                        data=json.dumps({"records": file_attachments}),
+                        headers={
+                            "Authorization": producermessage.usertoken,
+                            "Content-Type": "application/json",
+                        }
+                    )
+                    saveresponse.raise_for_status()
+            fitz_reader.close()
+
         elif extension.lower() in file_conversion_types:
             # "Extension different {0}, so need to download pdf here for pagecount!!".format(extension))
             pdfresponseofconverted = requests.get(
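
For reference, a minimal standalone sketch of the detection approach PATCH 1/3 takes: pre-check the document for annotations with PyMuPDF (fitz), then walk the pypdf page annotations looking for /FileAttachment entries. This is an illustration only, not part of the patch; it assumes the pypdf reader API used by the service, and the function name and sample path are placeholders.

    # Illustrative sketch only (not part of the patch). Mirrors the detection in
    # _generate_file_attachments without the S3 upload and record-service calls.
    # Assumes pypdf and PyMuPDF are installed; "sample.pdf" is a placeholder path.
    from io import BytesIO

    import fitz  # PyMuPDF
    from pypdf import PdfReader

    def list_file_attachment_names(pdf_bytes):
        names = []
        # Cheap pre-check: skip the page walk when the document has no annotations at all.
        with fitz.open(stream=BytesIO(pdf_bytes), filetype="pdf") as doc:
            if not doc.has_annots():
                return names
        reader = PdfReader(BytesIO(pdf_bytes))
        for page in reader.pages:
            for annotation in page.get("/Annots", []):
                obj = annotation.get_object()
                if obj.get("/Subtype") == "/FileAttachment":
                    filespec = obj["/FS"]
                    names.append(str(filespec["/F"]))
                    # filespec["/EF"]["/F"].get_data() would return the embedded bytes.
        return names

    if __name__ == "__main__":
        with open("sample.pdf", "rb") as f:  # placeholder file name
            print(list_file_attachment_names(f.read()))
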
From 3eb45681a80ed0fc75c23ca959e1c3a24837d629 Mon Sep 17 00:00:00 2001
From: unknown
Date: Tue, 9 Jul 2024 09:37:34 -0700
Subject: [PATCH 2/3] fix image issues with msg conversion

---
 .../MCS.FOI.MSGToPDF/MSGFileProcessor.cs      | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
index 5e6c443f0..b19bc60c7 100644
--- a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
+++ b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
@@ -153,8 +153,22 @@ public MSGFileProcessor(Stream sourceStream)
                             }
                         }
                         var startAt = 0;
-                        foreach (var inlineAttachment in inlineAttachments.OrderBy(m => m.GetType().GetProperty("RenderingPosition").GetValue(m, null)))
+                        foreach (var inlineAttachment in inlineAttachments.OrderBy(m => {
+                            int pos = (int) m.GetType().GetProperty("RenderingPosition").GetValue(m, null);
+                            if (pos > -1)
+                            {
+                                return pos;
+                            }
+                            else
+                            {
+                                var _inlineAttachment = (Storage.Attachment)m;
+                                Regex regex = new Regex(@"");
+                                Match match = regex.Match(bodyreplaced, startAt);
+                                return match.Index;
+                            }
+                        }))
+
                         {
                             if (rtfInline)
                             {
                                 if (!inlineAttachment.GetType().FullName.ToLower().Contains("message"))
                                 {
@@ -185,7 +199,7 @@ public MSGFileProcessor(Stream sourceStream)
                             else if (htmlInline)
                             {
                                 var _inlineAttachment = (Storage.Attachment)inlineAttachment;
-                                Regex regex = new Regex("");
+                                Regex regex = new Regex(@"");
                                 Match match = regex.Match(bodyreplaced, startAt);
                                 if (match.Success)
                                 {
@@ -218,7 +232,7 @@ public MSGFileProcessor(Stream sourceStream)
                                         heightString = " height =\"" + height + "\"";
                                     }
                                     string imgReplacementString = "";
-                                    bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, 1, startAt);
+                                    bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, Int32.MaxValue, startAt);
                                     startAt = match.Index + imgReplacementString.Length;
                                 }
                             foreach (KeyValuePair> attachment in attachmentsObj)
From a8b9737c0a3be2df6ef1658963ad0276beedf8a2 Mon Sep 17 00:00:00 2001
From: Aman-Hundal
Date: Tue, 16 Jul 2024 12:19:13 -0700
Subject: [PATCH 3/3] Added logic to generate an error during dedupe process
 when an attachment or embedded file is found in the PDF. This error is
 recorded in the dedupejob message and used in foi-flow to convey the error
 message to the user.

---
 api/reviewer_api/models/DeduplicationJob.py   |  4 +-
 api/reviewer_api/services/documentservice.py  |  1 +
 .../DedupeServices/services/dedupeservice.py  |  2 +-
 .../services/s3documentservice.py             | 40 +++++++++++--------
 4 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/api/reviewer_api/models/DeduplicationJob.py b/api/reviewer_api/models/DeduplicationJob.py
index 81699e690..0258ed6c0 100644
--- a/api/reviewer_api/models/DeduplicationJob.py
+++ b/api/reviewer_api/models/DeduplicationJob.py
@@ -64,12 +64,12 @@ def getdedupestatus(cls, ministryrequestid):
         executions = []
         try:
             sql = """select distinct on (deduplicationjobid) deduplicationjobid, version,
-                filename, status, documentmasterid, trigger
+                filename, status, documentmasterid, trigger, message
                 from "DeduplicationJob" fcj where ministryrequestid = :ministryrequestid
                 order by deduplicationjobid, "version" desc"""
             rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
             for row in rs:
-                executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"]})
+                executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"], "message": row["message"]})
         except Exception as ex:
             logging.error(ex)
         db.session.close()
diff --git a/api/reviewer_api/services/documentservice.py b/api/reviewer_api/services/documentservice.py
index a4b23b6f4..301c09c1c 100644
--- a/api/reviewer_api/services/documentservice.py
+++ b/api/reviewer_api/services/documentservice.py
@@ -251,6 +251,7 @@ def __updatededupestatus(self, dedupes, record):
                 record["deduplicationstatus"] = dedupe["status"]
                 record["filename"] = dedupe["filename"]
                 record["trigger"] = dedupe["trigger"]
+                record["message"] = dedupe["message"]
         return record
 
     def __updateproperties_old(self, properties, records, record):
diff --git a/computingservices/DedupeServices/services/dedupeservice.py b/computingservices/DedupeServices/services/dedupeservice.py
index 0799beea4..a4af991b6 100644
--- a/computingservices/DedupeServices/services/dedupeservice.py
+++ b/computingservices/DedupeServices/services/dedupeservice.py
@@ -21,4 +21,4 @@ def processmessage(message):
             documentspagecalculatorproducerservice().producepagecalculatorevent(pagecalculatormessage, _pagecount, pagecalculatorjobid)
     except(Exception) as error:
         print("Exception while processing redis message, func processmessage(p3), Error : {0} ".format(error))
-        recordjobend(message, True, traceback.format_exc())
\ No newline at end of file
+        recordjobend(message, True, error.args[0])
\ No newline at end of file
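
A minimal sketch of how the error text recorded above is expected to surface, assuming recordjobend persists its third argument into the DeduplicationJob "message" column that getdedupestatus now selects. record_job_end and fetch_status_row below are illustrative stand-ins, not the service's real helpers.

    # Illustrative sketch only: the intended path of the error text from the dedupe
    # worker to the record shown in foi-flow. job_table stands in for the
    # "DeduplicationJob" table; the raised message matches the one in this patch.
    job_table = []

    def record_job_end(message, iserror, errortext):
        # Mirrors recordjobend(message, True, error.args[0]): the raised text is persisted.
        job_table.append({
            "filename": message["filename"],
            "status": "error" if iserror else "completed",
            "message": errortext,
        })

    def fetch_status_row(filename):
        # Mirrors the updated query: status rows now carry "message" as well.
        return next(row for row in job_table if row["filename"] == filename)

    try:
        raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
    except Exception as error:
        record_job_end({"filename": "example.pdf"}, True, error.args[0])

    print(fetch_status_row("example.pdf")["message"])  # the text foi-flow displays to the user
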
diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py
index 7b1d06f09..ff99843c9 100644
--- a/computingservices/DedupeServices/services/s3documentservice.py
+++ b/computingservices/DedupeServices/services/s3documentservice.py
@@ -72,21 +72,25 @@ def _generate_file_attachments(producermessage, reader, auth):
             annotations = page["/Annots"]
             for annotation in annotations:
                 subtype = annotation.get_object()["/Subtype"]
-                if subtype == "/FileAttachment":
-                    producermessage.attributes["hasattachment"] = True
-                    fileobj = annotation.get_object()["/FS"]
-                    file = fileobj["/F"]
-                    data = fileobj["/EF"]["/F"].get_data()
-                    # data = BytesIO(data).getvalue()
-                    s3uripath = (
-                        path.splitext(producermessage.s3filepath)[0]
-                        + "/"
-                        + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
-                    )
-                    uploadresponse = requests.put(s3uripath, data=data, auth=auth)
-                    uploadresponse.raise_for_status()
-                    attachment = _prepareattachment(producermessage, data, s3uripath, file)
-                    file_attachments.append(attachment)
+                if subtype == "/FileAttachment":
+                    # Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise this feature and extract attachments/embedded files into one new parent PDF, this error handling will be removed.
+                    raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
+
+                    # Old logic to extract embedded files. Uncomment when the new feature to save PDF embedded files and attachments as one file is started.
+                    # producermessage.attributes["hasattachment"] = True
+                    # fileobj = annotation.get_object()["/FS"]
+                    # file = fileobj["/F"]
+                    # data = fileobj["/EF"]["/F"].get_data()
+                    # # data = BytesIO(data).getvalue()
+                    # s3uripath = (
+                    #     path.splitext(producermessage.s3filepath)[0]
+                    #     + "/"
+                    #     + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
+                    # )
+                    # uploadresponse = requests.put(s3uripath, data=data, auth=auth)
+                    # uploadresponse.raise_for_status()
+                    # attachment = _prepareattachment(producermessage, data, s3uripath, file)
+                    # file_attachments.append(attachment)
     return file_attachments
 
 def gets3documenthashcode(producermessage):
@@ -125,7 +129,11 @@ def gets3documenthashcode(producermessage):
             if "/Collection" in reader.trailer["/Root"]:
                 producermessage.attributes["isportfolio"] = True
             else:
-                producermessage.attributes["hasattachment"] = True
+                # Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise this feature and extract attachments/embedded files into one new parent PDF, this error handling will be removed.
+                raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
+
+                # Old logic to extract attached files. Uncomment when the new feature to save PDF embedded files and attachments as one file is started.
+                # producermessage.attributes["hasattachment"] = True
             for name in reader.attachments:
                 s3uripath = (
                     path.splitext(filepath)[0]
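
To summarize the guard this patch introduces, a hedged sketch of the two rejected conditions expressed as a single helper: a non-portfolio document exposing name-tree attachments, and any document carrying /FileAttachment annotations. The helper name is illustrative; the service itself raises inline rather than through a helper, and portfolio PDFs (a /Collection entry in the catalog) continue to be processed.

    # Illustrative sketch only: the conditions rejected by the placeholder logic above,
    # assuming the pypdf reader API used by the service.
    ATTACHMENT_ERROR = "PDF contains attachments and/or embedded files. File must be manually fixed and replaced"

    def ensure_no_embedded_files(reader):
        is_portfolio = "/Collection" in reader.trailer["/Root"]
        if reader.attachments and not is_portfolio:
            # Second hunk: attachments outside a portfolio are rejected.
            raise Exception(ATTACHMENT_ERROR)
        for page in reader.pages:
            for annotation in page.get("/Annots", []):
                if annotation.get_object().get("/Subtype") == "/FileAttachment":
                    # First hunk: file-attachment annotations are rejected.
                    raise Exception(ATTACHMENT_ERROR)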