diff --git a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
index 5e6c443f0..b19bc60c7 100644
--- a/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
+++ b/MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
@@ -153,8 +153,22 @@ public MSGFileProcessor(Stream sourceStream)
}
}
var startAt = 0;
- foreach (var inlineAttachment in inlineAttachments.OrderBy(m => m.GetType().GetProperty("RenderingPosition").GetValue(m, null)))
+ foreach (var inlineAttachment in inlineAttachments.OrderBy(m =>
{
+ int pos = (int) m.GetType().GetProperty("RenderingPosition").GetValue(m, null);
+ if (pos > -1)
+ {
+ return pos;
+ }
+ else
+ {
+ var _inlineAttachment = (Storage.Attachment)m;
+ Regex regex = new Regex(@"");
+ Match match = regex.Match(bodyreplaced, startAt);
+ return match.Index;
+ }
+ }))
+ {
if (rtfInline)
{
if (!inlineAttachment.GetType().FullName.ToLower().Contains("message"))
@@ -185,7 +199,7 @@ public MSGFileProcessor(Stream sourceStream)
else if (htmlInline)
{
var _inlineAttachment = (Storage.Attachment)inlineAttachment;
- Regex regex = new Regex("");
+ Regex regex = new Regex(@"");
Match match = regex.Match(bodyreplaced, startAt);
if (match.Success)
{
@@ -218,7 +232,7 @@ public MSGFileProcessor(Stream sourceStream)
heightString = " height =\"" + height + "\"";
}
string imgReplacementString = "";
- bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, 1, startAt);
+ bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, Int32.MaxValue, startAt);
startAt = match.Index + imgReplacementString.Length;
}
foreach (KeyValuePair> attachment in attachmentsObj)
diff --git a/api/reviewer_api/models/DeduplicationJob.py b/api/reviewer_api/models/DeduplicationJob.py
index 81699e690..0258ed6c0 100644
--- a/api/reviewer_api/models/DeduplicationJob.py
+++ b/api/reviewer_api/models/DeduplicationJob.py
@@ -64,12 +64,12 @@ def getdedupestatus(cls, ministryrequestid):
executions = []
try:
sql = """select distinct on (deduplicationjobid) deduplicationjobid, version,
- filename, status, documentmasterid, trigger
+ filename, status, documentmasterid, trigger, message
from "DeduplicationJob" fcj where ministryrequestid = :ministryrequestid
order by deduplicationjobid, "version" desc"""
rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
for row in rs:
- executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"]})
+ executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"], "message": row["message"]})
except Exception as ex:
logging.error(ex)
db.session.close()
diff --git a/api/reviewer_api/services/documentservice.py b/api/reviewer_api/services/documentservice.py
index 33170c4ba..0f5ad7734 100644
--- a/api/reviewer_api/services/documentservice.py
+++ b/api/reviewer_api/services/documentservice.py
@@ -251,6 +251,7 @@ def __updatededupestatus(self, dedupes, record):
record["deduplicationstatus"] = dedupe["status"]
record["filename"] = dedupe["filename"]
record["trigger"] = dedupe["trigger"]
+ record["message"] = dedupe["message"]
return record
def __updateproperties_old(self, properties, records, record):
diff --git a/computingservices/DedupeServices/requirements.txt b/computingservices/DedupeServices/requirements.txt
index ca7e1b33c..8128a986f 100644
Binary files a/computingservices/DedupeServices/requirements.txt and b/computingservices/DedupeServices/requirements.txt differ
diff --git a/computingservices/DedupeServices/services/dedupeservice.py b/computingservices/DedupeServices/services/dedupeservice.py
index 0799beea4..a4af991b6 100644
--- a/computingservices/DedupeServices/services/dedupeservice.py
+++ b/computingservices/DedupeServices/services/dedupeservice.py
@@ -21,4 +21,4 @@ def processmessage(message):
documentspagecalculatorproducerservice().producepagecalculatorevent(pagecalculatormessage, _pagecount, pagecalculatorjobid)
except(Exception) as error:
print("Exception while processing redis message, func processmessage(p3), Error : {0} ".format(error))
- recordjobend(message, True, traceback.format_exc())
\ No newline at end of file
+        recordjobend(message, True, str(error))
\ No newline at end of file
diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py
index 6f4d46047..ff99843c9 100644
--- a/computingservices/DedupeServices/services/s3documentservice.py
+++ b/computingservices/DedupeServices/services/s3documentservice.py
@@ -13,6 +13,7 @@
import hashlib
import uuid
from re import sub
+import fitz
from utils import (
gets3credentialsobject,
getdedupeproducermessage,
@@ -49,6 +50,49 @@ def __getcredentialsbybcgovcode(bcgovcode):
return s3cred
+def _prepareattachment(producermessage, data, s3uripath, file_name):
+ attachment = {
+ "filename": escape(sub("<[0-9]+>", "", file_name, 1)),
+ "s3uripath": s3uripath,
+ "attributes": deepcopy(producermessage.attributes),
+ }
+ attachment["attributes"]["filesize"] = len(data)
+ attachment["attributes"][
+ "parentpdfmasterid"
+ ] = producermessage.documentmasterid
+ attachment["attributes"].pop("batch")
+ attachment["attributes"].pop("extension")
+ attachment["attributes"].pop("incompatible")
+ return attachment
+
+def _generate_file_attachments(producermessage, reader, auth):
+ file_attachments = []
+ for page in reader.pages:
+ if "/Annots" in page:
+ annotations = page["/Annots"]
+ for annotation in annotations:
+ subtype = annotation.get_object()["/Subtype"]
+ if subtype == "/FileAttachment":
+                    # Placeholder logic to handle pdf attachments+embeds. Once resources are available to revise this feature and extract attachments + embeds into one new parent PDF, this error handling will be removed.
+ raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
+
+                    # Old logic to extract embedded files. Uncomment when the new feature to save pdf embeds + attachments as one file is started.
+ # producermessage.attributes["hasattachment"] = True
+ # fileobj = annotation.get_object()["/FS"]
+ # file = fileobj["/F"]
+ # data = fileobj["/EF"]["/F"].get_data()
+ # # data = BytesIO(data).getvalue()
+ # s3uripath = (
+ # path.splitext(producermessage.s3filepath)[0]
+ # + "/"
+ # + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
+ # )
+ # uploadresponse = requests.put(s3uripath, data=data, auth=auth)
+ # uploadresponse.raise_for_status()
+ # attachment = _prepareattachment(producermessage, data, s3uripath, file)
+ # file_attachments.append(attachment)
+ return file_attachments
+
def gets3documenthashcode(producermessage):
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
s3_access_key_id = s3credentials.s3accesskey
@@ -85,7 +129,11 @@ def gets3documenthashcode(producermessage):
if "/Collection" in reader.trailer["/Root"]:
producermessage.attributes["isportfolio"] = True
else:
- producermessage.attributes["hasattachment"] = True
+                # Placeholder logic to handle pdf attachments+embeds. Once resources are available to revise this feature and extract attachments + embeds into one new parent PDF, this error handling will be removed.
+ raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")
+
+                # Old logic to extract attached files. Uncomment when the new feature to save pdf embeds + attachments as one file is started.
+ # producermessage.attributes["hasattachment"] = True
for name in reader.attachments:
s3uripath = (
path.splitext(filepath)[0]
@@ -95,18 +143,7 @@ def gets3documenthashcode(producermessage):
data = b"".join(reader.attachments[name])
uploadresponse = requests.put(s3uripath, data=data, auth=auth)
uploadresponse.raise_for_status()
- attachment = {
- "filename": escape(sub("<[0-9]+>", "", name, 1)),
- "s3uripath": s3uripath,
- "attributes": deepcopy(producermessage.attributes),
- }
- attachment["attributes"]["filesize"] = len(data)
- attachment["attributes"][
- "parentpdfmasterid"
- ] = producermessage.documentmasterid
- attachment["attributes"].pop("batch")
- attachment["attributes"].pop("extension")
- attachment["attributes"].pop("incompatible")
+ attachment = _prepareattachment(producermessage, data, s3uripath, name)
attachments.append(attachment)
saveresponse = requests.post(
request_management_api
@@ -119,6 +156,26 @@ def gets3documenthashcode(producermessage):
},
)
saveresponse.raise_for_status()
+
+ # New logic to extract embedded file attachments (classified under annotations in the PDF) from pages in PDF
+ # Before looping of pdf pages started; confirm if annotations exist in the pdf using pyMuPdf library (fitz)
+ fitz_reader = fitz.open(stream=BytesIO(response.content), filetype="pdf")
+ if (fitz_reader.has_annots()):
+ file_attachments = _generate_file_attachments(producermessage, reader, auth)
+ if (len(file_attachments) > 0):
+ saveresponse = requests.post(
+ request_management_api
+ + "/api/foirecord/-1/ministryrequest/"
+ + producermessage.ministryrequestid,
+ data=json.dumps({"records": file_attachments}),
+ headers={
+ "Authorization": producermessage.usertoken,
+ "Content-Type": "application/json",
+ }
+ )
+ saveresponse.raise_for_status()
+ fitz_reader.close()
+
elif extension.lower() in file_conversion_types:
# "Extension different {0}, so need to download pdf here for pagecount!!".format(extension))
pdfresponseofconverted = requests.get(