Skip to content

Commit

Permalink
Merge pull request #1058 from bcgov/dev
Browse files Browse the repository at this point in the history
Dev to Test (Tickets 2880 + 3232)
  • Loading branch information
Aman-Hundal authored Jul 22, 2024
2 parents c33bfc2 + 5540abd commit 36c6c5d
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 16 deletions.
4 changes: 2 additions & 2 deletions api/reviewer_api/models/DeduplicationJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@ def getdedupestatus(cls, ministryrequestid):
executions = []
try:
sql = """select distinct on (deduplicationjobid) deduplicationjobid, version,
filename, status, documentmasterid, trigger
filename, status, documentmasterid, trigger, message
from "DeduplicationJob" fcj where ministryrequestid = :ministryrequestid
order by deduplicationjobid, "version" desc"""
rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
for row in rs:
executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"]})
executions.append({"deduplicationjobid": row["deduplicationjobid"], "version": row["version"], "filename": row["filename"], "status": row["status"], "documentmasterid": row["documentmasterid"], "trigger":row["trigger"], "message": row["message"]})
except Exception as ex:
logging.error(ex)
db.session.close()
Expand Down
1 change: 1 addition & 0 deletions api/reviewer_api/services/documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def __updatededupestatus(self, dedupes, record):
record["deduplicationstatus"] = dedupe["status"]
record["filename"] = dedupe["filename"]
record["trigger"] = dedupe["trigger"]
record["message"] = dedupe["message"]
return record

def __updateproperties_old(self, properties, records, record):
Expand Down
Binary file modified computingservices/DedupeServices/requirements.txt
Binary file not shown.
2 changes: 1 addition & 1 deletion computingservices/DedupeServices/services/dedupeservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ def processmessage(message):
documentspagecalculatorproducerservice().producepagecalculatorevent(pagecalculatormessage, _pagecount, pagecalculatorjobid)
except(Exception) as error:
print("Exception while processing redis message, func processmessage(p3), Error : {0} ".format(error))
recordjobend(message, True, traceback.format_exc())
recordjobend(message, True, error.args[0])
83 changes: 70 additions & 13 deletions computingservices/DedupeServices/services/s3documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import hashlib
import uuid
from re import sub
import fitz
from utils import (
gets3credentialsobject,
getdedupeproducermessage,
Expand Down Expand Up @@ -49,6 +50,49 @@ def __getcredentialsbybcgovcode(bcgovcode):

return s3cred

def _prepareattachment(producermessage, data, s3uripath, file_name):
    """Build the record payload describing one file extracted from a parent PDF.

    Args:
        producermessage: Message of the parent document. Its ``attributes``
            mapping is deep-copied (the original is left untouched) and its
            ``documentmasterid`` becomes the attachment's parent id.
        data: Raw content of the extracted file; only its length is used.
        s3uripath: URI stored on the record as the attachment's location.
        file_name: File name as reported by the PDF.

    Returns:
        A dict with ``filename``, ``s3uripath`` and ``attributes`` keys.
    """
    attributes = deepcopy(producermessage.attributes)
    attributes["filesize"] = len(data)
    attributes["parentpdfmasterid"] = producermessage.documentmasterid
    # These keys are dropped from the attachment's copy of the attributes.
    # A missing key is tolerated so that a sparse parent message does not
    # abort the job with a bare KeyError.
    for parent_only_key in ("batch", "extension", "incompatible"):
        attributes.pop(parent_only_key, None)

    # Strip only the first "<digits>" marker from the name, then HTML-escape it.
    # `count` is passed by keyword: the positional form is deprecated.
    cleaned_name = sub("<[0-9]+>", "", file_name, count=1)
    return {
        "filename": escape(cleaned_name),
        "s3uripath": s3uripath,
        "attributes": attributes,
    }

def _generate_file_attachments(producermessage, reader, auth):
    """Scan every page of a PDF for file-attachment annotations.

    Extraction of such attachments is currently disabled: the first
    ``/FileAttachment`` annotation found aborts processing with an error so
    the file can be fixed manually. When none is found an empty list is
    returned.

    Args:
        producermessage: Message of the parent document. Unused while the
            extraction logic below stays commented out.
        reader: PDF reader exposing ``pages``; each page is checked for an
            ``/Annots`` entry whose items provide ``get_object()``.
        auth: Upload credentials. Unused while the extraction logic below
            stays commented out.

    Returns:
        A list of attachment records (always empty at present).

    Raises:
        Exception: If any page carries a ``/FileAttachment`` annotation.
    """
    file_attachments = []
    for page in reader.pages:
        if "/Annots" not in page:
            continue
        for annotation in page["/Annots"]:
            annotation_object = annotation.get_object()
            # Skip annotation dictionaries without a subtype instead of
            # failing the whole job with a KeyError.
            if "/Subtype" not in annotation_object:
                continue
            if annotation_object["/Subtype"] == "/FileAttachment":
                # Placeholder logic to handle PDF attachments and embedded
                # files. Once resources are available to revise the feature
                # and extract attachments and embedded files into one new
                # parent PDF, this error handling will be removed.
                raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")

                # Old logic to extract embedded files. Uncomment when the new
                # feature to save PDF embedded files and attachments as one
                # file is started.
                # producermessage.attributes["hasattachment"] = True
                # fileobj = annotation.get_object()["/FS"]
                # file = fileobj["/F"]
                # data = fileobj["/EF"]["/F"].get_data()
                # # data = BytesIO(data).getvalue()
                # s3uripath = (
                #     path.splitext(producermessage.s3filepath)[0]
                #     + "/"
                #     + "{0}{1}".format(uuid.uuid4(), path.splitext(file)[1])
                # )
                # uploadresponse = requests.put(s3uripath, data=data, auth=auth)
                # uploadresponse.raise_for_status()
                # attachment = _prepareattachment(producermessage, data, s3uripath, file)
                # file_attachments.append(attachment)
    return file_attachments

def gets3documenthashcode(producermessage):
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
s3_access_key_id = s3credentials.s3accesskey
Expand Down Expand Up @@ -85,7 +129,11 @@ def gets3documenthashcode(producermessage):
if "/Collection" in reader.trailer["/Root"]:
producermessage.attributes["isportfolio"] = True
else:
producermessage.attributes["hasattachment"] = True
# Placeholder logic to handle PDF attachments and embedded files. Once resources are available to revise the feature and extract attachments and embedded files into one new parent PDF, this error handling will be removed.
raise Exception("PDF contains attachments and/or embedded files. File must be manually fixed and replaced")

# Old logic to extract attached files. Uncomment when the new feature to save PDF embedded files and attachments as one file is started.
# producermessage.attributes["hasattachment"] = True
for name in reader.attachments:
s3uripath = (
path.splitext(filepath)[0]
Expand All @@ -95,18 +143,7 @@ def gets3documenthashcode(producermessage):
data = b"".join(reader.attachments[name])
uploadresponse = requests.put(s3uripath, data=data, auth=auth)
uploadresponse.raise_for_status()
attachment = {
"filename": escape(sub("<[0-9]+>", "", name, 1)),
"s3uripath": s3uripath,
"attributes": deepcopy(producermessage.attributes),
}
attachment["attributes"]["filesize"] = len(data)
attachment["attributes"][
"parentpdfmasterid"
] = producermessage.documentmasterid
attachment["attributes"].pop("batch")
attachment["attributes"].pop("extension")
attachment["attributes"].pop("incompatible")
attachment = _prepareattachment(producermessage, data, s3uripath, name)
attachments.append(attachment)
saveresponse = requests.post(
request_management_api
Expand All @@ -119,6 +156,26 @@ def gets3documenthashcode(producermessage):
},
)
saveresponse.raise_for_status()

# New logic to extract embedded file attachments (classified under annotations in the PDF) from pages in PDF
# Before looping of pdf pages started; confirm if annotations exist in the pdf using pyMuPdf library (fitz)
fitz_reader = fitz.open(stream=BytesIO(response.content), filetype="pdf")
if (fitz_reader.has_annots()):
file_attachments = _generate_file_attachments(producermessage, reader, auth)
if (len(file_attachments) > 0):
saveresponse = requests.post(
request_management_api
+ "/api/foirecord/-1/ministryrequest/"
+ producermessage.ministryrequestid,
data=json.dumps({"records": file_attachments}),
headers={
"Authorization": producermessage.usertoken,
"Content-Type": "application/json",
}
)
saveresponse.raise_for_status()
fitz_reader.close()

elif extension.lower() in file_conversion_types:
# "Extension different {0}, so need to download pdf here for pagecount!!".format(extension))
pdfresponseofconverted = requests.get(
Expand Down

0 comments on commit 36c6c5d

Please sign in to comment.