Skip to content

Commit

Permalink
Merge pull request #1067 from bcgov/dev-rook-RQ-FOIMOD-2975-bugfix
Browse files Browse the repository at this point in the history
Sync test and test-rook
  • Loading branch information
richard-aot authored Jul 25, 2024
2 parents c2ff9d9 + 8f50758 commit 235ab5e
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 19 deletions.
20 changes: 17 additions & 3 deletions MCS.FOI.S3FileConversion/MCS.FOI.MSGToPDF/MSGFileProcessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,22 @@ public MSGFileProcessor(Stream sourceStream)
}
}
var startAt = 0;
foreach (var inlineAttachment in inlineAttachments.OrderBy(m => m.GetType().GetProperty("RenderingPosition").GetValue(m, null)))
foreach (var inlineAttachment in inlineAttachments.OrderBy(m =>
{
int pos = (int) m.GetType().GetProperty("RenderingPosition").GetValue(m, null);
if (pos > -1)
{
return pos;
}
else
{
var _inlineAttachment = (Storage.Attachment)m;
Regex regex = new Regex(@"<img(.|\\n)*cid:" + _inlineAttachment.ContentId + "(.|\\n)*?>");
Match match = regex.Match(bodyreplaced, startAt);
return match.Index;
}
}))
{
if (rtfInline)
{
if (!inlineAttachment.GetType().FullName.ToLower().Contains("message"))
Expand Down Expand Up @@ -185,7 +199,7 @@ public MSGFileProcessor(Stream sourceStream)
else if (htmlInline)
{
var _inlineAttachment = (Storage.Attachment)inlineAttachment;
Regex regex = new Regex("<img(.|\\n)*cid:" + _inlineAttachment.ContentId + "(.|\\n)*?>");
Regex regex = new Regex(@"<img(.|\\n)*cid:" + _inlineAttachment.ContentId + "(.|\\n)*?>");
Match match = regex.Match(bodyreplaced, startAt);
if (match.Success)
{
Expand Down Expand Up @@ -218,7 +232,7 @@ public MSGFileProcessor(Stream sourceStream)
heightString = " height =\"" + height + "\"";
}
string imgReplacementString = "<img "+ widthString + heightString + " style =\"margin: 1px;\" src=\"data:" + _inlineAttachment.MimeType + ";base64," + Convert.ToBase64String(_inlineAttachment.Data) + "\"/>";
bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, 1, startAt);
bodyreplaced = regex.Replace(bodyreplaced, imgReplacementString, Int32.MaxValue, startAt);
startAt = match.Index + imgReplacementString.Length;
}
foreach (KeyValuePair<MemoryStream, Dictionary<string, string>> attachment in attachmentsObj)
Expand Down
4 changes: 2 additions & 2 deletions api/reviewer_api/models/DeduplicationJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@ def getdedupestatus(cls, ministryrequestid):
executions = []
try:
sql = """select distinct on (deduplicationjobid) deduplicationjobid, version,
filename, status, documentmasterid, trigger
filename, status, documentmasterid, trigger, message
from "DeduplicationJob" fcj where ministryrequestid = :ministryrequestid
order by deduplicationjobid, "version" desc"""
rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
for row in rs:
executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"]})
executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"], "message": row["message"]})
except Exception as ex:
logging.error(ex)
db.session.close()
Expand Down
1 change: 1 addition & 0 deletions api/reviewer_api/services/documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def __updatededupestatus(self, dedupes, record):
record["deduplicationstatus"] = dedupe["status"]
record["filename"] = dedupe["filename"]
record["trigger"] = dedupe["trigger"]
record["message"] = dedupe["message"]
return record

def __updateproperties_old(self, properties, records, record):
Expand Down
Binary file modified computingservices/DedupeServices/requirements.txt
Binary file not shown.
2 changes: 1 addition & 1 deletion computingservices/DedupeServices/services/dedupeservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ def processmessage(message):
documentspagecalculatorproducerservice().producepagecalculatorevent(pagecalculatormessage, _pagecount, pagecalculatorjobid)
except(Exception) as error:
print("Exception while processing redis message, func processmessage(p3), Error : {0} ".format(error))
recordjobend(message, True, traceback.format_exc())
recordjobend(message, True, error.args[0])
83 changes: 70 additions & 13 deletions computingservices/DedupeServices/services/s3documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import hashlib
import uuid
from re import sub
import fitz
from utils import (
gets3credentialsobject,
getdedupeproducermessage,
Expand Down Expand Up @@ -49,6 +50,49 @@ def __getcredentialsbybcgovcode(bcgovcode):

return s3cred

def _prepareattachment(producermessage, data, s3uripath, file_name):
attachment = {
"filename": escape(sub("<[0-9]+>", "", file_name, 1)),
"s3uripath": s3uripath,
"attributes": deepcopy(producermessage.attributes),
}
attachment["attributes"]["filesize"] = len(data)
attachment["attributes"][
"parentpdfmasterid"
] = producermessage.documentmasterid
attachment["attributes"].pop("batch")
attachment["attributes"].pop("extension")
attachment["attributes"].pop("incompatible")
return attachment

def _generate_file_attachments(producermessage, reader, auth):
file_attachments = []
for page in reader.pages:
if "/Annots" in page:
annotations = page["/Annots"]
for annotation in annotations:
subtype = annotation.get_object()["/Subtype"]
if subtype == "/FileAttachment":
# Placeholder logic to handle pdf attachments+embedds. Once resources available to revise feature, and extract attachments + embedds into one new parent PDF, this error handling will be removed.
raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")

# Old logic to extract embedded files. Uncomment when new feature to save pdf embedds + attachemnts as one file is started.
# producermessage.attributes["hasattachment"] = True
# fileobj = annotation.get_object()["/FS"]
# file = fileobj["/F"]
# data = fileobj["/EF"]["/F"].get_data()
# # data = BytesIO(data).getvalue()
# s3uripath = (
# path.splitext(producermessage.s3filepath)[0]
# + "/"
# + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
# )
# uploadresponse = requests.put(s3uripath, data=data, auth=auth)
# uploadresponse.raise_for_status()
# attachment = _prepareattachment(producermessage, data, s3uripath, file)
# file_attachments.append(attachment)
return file_attachments

def gets3documenthashcode(producermessage):
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
s3_access_key_id = s3credentials.s3accesskey
Expand Down Expand Up @@ -85,7 +129,11 @@ def gets3documenthashcode(producermessage):
if "/Collection" in reader.trailer["/Root"]:
producermessage.attributes["isportfolio"] = True
else:
producermessage.attributes["hasattachment"] = True
# Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise the feature and extract attachments and embedded files into one new parent PDF, this error handling will be removed.
raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")

# Old logic to extract attached files. Uncomment when the new feature to save PDF embedded files + attachments as one file is started.
# producermessage.attributes["hasattachment"] = True
for name in reader.attachments:
s3uripath = (
path.splitext(filepath)[0]
Expand All @@ -95,18 +143,7 @@ def gets3documenthashcode(producermessage):
data = b"".join(reader.attachments[name])
uploadresponse = requests.put(s3uripath, data=data, auth=auth)
uploadresponse.raise_for_status()
attachment = {
"filename": escape(sub("<[0-9]+>", "", name, 1)),
"s3uripath": s3uripath,
"attributes": deepcopy(producermessage.attributes),
}
attachment["attributes"]["filesize"] = len(data)
attachment["attributes"][
"parentpdfmasterid"
] = producermessage.documentmasterid
attachment["attributes"].pop("batch")
attachment["attributes"].pop("extension")
attachment["attributes"].pop("incompatible")
attachment = _prepareattachment(producermessage, data, s3uripath, name)
attachments.append(attachment)
saveresponse = requests.post(
request_management_api
Expand All @@ -119,6 +156,26 @@ def gets3documenthashcode(producermessage):
},
)
saveresponse.raise_for_status()

# New logic to extract embedded file attachments (classified as annotations in the PDF) from the pages of the PDF.
# Before looping over the PDF pages, confirm that annotations exist in the PDF using the PyMuPDF library (fitz).
fitz_reader = fitz.open(stream=BytesIO(response.content), filetype="pdf")
if (fitz_reader.has_annots()):
file_attachments = _generate_file_attachments(producermessage, reader, auth)
if (len(file_attachments) > 0):
saveresponse = requests.post(
request_management_api
+ "/api/foirecord/-1/ministryrequest/"
+ producermessage.ministryrequestid,
data=json.dumps({"records": file_attachments}),
headers={
"Authorization": producermessage.usertoken,
"Content-Type": "application/json",
}
)
saveresponse.raise_for_status()
fitz_reader.close()

elif extension.lower() in file_conversion_types:
# "Extension different {0}, so need to download pdf here for pagecount!!".format(extension))
pdfresponseofconverted = requests.get(
Expand Down

0 comments on commit 235ab5e

Please sign in to comment.