Skip to content

Commit

Permalink
Merge pull request #665 from bcgov/dev-DV-4708
Browse files Browse the repository at this point in the history
Flatten PDF - Dedupe, web, api changes
  • Loading branch information
divyav-aot authored Dec 20, 2023
2 parents a48441e + f8a1fe0 commit e5860a8
Show file tree
Hide file tree
Showing 13 changed files with 226 additions and 45 deletions.
4 changes: 2 additions & 2 deletions api/reviewer_api/models/DocumentMaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,15 @@ def getdocumentproperty(cls, ministryrequestid, deleted):
documentmasters = []
try:
sql = """select dm.documentmasterid, dm.processingparentid, d.documentid, d.version,
dhc.rank1hash, d.filename, d.pagecount, dm.parentid from "DocumentMaster" dm,
dhc.rank1hash, d.filename, d.pagecount, d.attributes, dm.parentid from "DocumentMaster" dm,
"Documents" d, "DocumentHashCodes" dhc
where dm.ministryrequestid = :ministryrequestid and dm.ministryrequestid = d.foiministryrequestid
and dm.documentmasterid = d.documentmasterid
and d.documentid = dhc.documentid order by dm.documentmasterid;"""
rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
for row in rs:
if (row["processingparentid"] is not None and row["processingparentid"] not in deleted) or (row["processingparentid"] is None and row["documentmasterid"] not in deleted):
documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "parentid": row["parentid"], "version": row["version"]})
documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "documentattribute": row["attributes"], "parentid": row["parentid"], "version": row["version"]})
except Exception as ex:
logging.error(ex)
db.session.close()
Expand Down
20 changes: 12 additions & 8 deletions api/reviewer_api/resources/foiflowmasterdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

from reviewer_api.services.radactionservice import redactionservice
from reviewer_api.services.documentservice import documentservice
from reviewer_api.utils.constants import FILE_CONVERSION_FILE_TYPES

API = Namespace(
"FOI Flow Master Data", description="Endpoints for FOI Flow master data"
Expand Down Expand Up @@ -128,11 +129,14 @@ def post():
)

documentobjs = []
documentids = [documentinfo["file"]["documentid"] for documentinfo in data["documentobjs"]]
documents = documentservice().getdocumentbyids(documentids)
for documentinfo in data["documentobjs"]:
filepath = "/".join(documents[documentinfo["file"]["documentid"]].split("/")[4:])
filepath = "/".join(documentinfo["file"]["filepath"].split("/")[4:])
if documentinfo["file"]["processedfilepath"]:
filepath = "/".join(documentinfo["file"]["processedfilepath"].split("/")[4:])
filename, file_extension = os.path.splitext(filepath)
if file_extension in FILE_CONVERSION_FILE_TYPES:
filepath = filename + ".pdf"

documentinfo["s3url"] = s3client.generate_presigned_url(
ClientMethod="get_object",
Params={
Expand Down Expand Up @@ -297,8 +301,6 @@ def post(ministryrequestid):
filepathlist[0], division_name
)

# filename_put, file_extension_put = os.path.splitext(filepath_put)
# filepath_put = filename_put+'.pdf'
s3path_save = s3client.generate_presigned_url(
ClientMethod="get_object",
Params={
Expand All @@ -313,11 +315,13 @@ def post(ministryrequestid):
# for save/put - stitch by division
div["s3path_save"] = s3path_save
for doc in div["documentlist"]:
realfilepath = documentservice().getfilepathbydocumentid(doc["documentid"])
# filepathlist = doc["filepath"].split("/")[4:]
filepathlist = realfilepath.split("/")[4:]
filepathlist = doc["filepath"].split("/")[4:]
if doc["processedfilepath"]:
filepathlist = doc["processedfilepath"].split("/")[4:]

# for load/get
filepath_get = "/".join(filepathlist)

filename_get, file_extension_get = os.path.splitext(
filepath_get
)
Expand Down
18 changes: 16 additions & 2 deletions api/reviewer_api/services/documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def __updateproperties(
_att_in_properties = []
(
record["pagecount"],
record["processedpagecount"],
record["filename"],
record["documentid"],
record["version"],
Expand Down Expand Up @@ -120,6 +121,7 @@ def __updateproperties(

(
attachment["pagecount"],
attachment["processedpagecount"],
attachment["filename"],
attachment["documentid"],
attachment["version"],
Expand All @@ -138,9 +140,15 @@ def __filterrecords(self, records):
if record["recordid"] is None:
attchments.append(record)
return parentrecords, parentswithattachments, attchments


def __getprocessedpagecount(self, property, pagecount):
    """Return the processed page count recorded in the property's document
    attributes; fall back to *pagecount* when the attributes are missing or
    carry no 'processedpagecount' entry."""
    attributes = property["documentattribute"]
    if not attributes:
        return pagecount
    return attributes.get("processedpagecount", pagecount)

def __getpagecountandfilename(self, record, properties):
pagecount = 0
processedpagecount = 0
filename = record["filename"] if "filename" in record else None
documentid = None
version = 0
Expand All @@ -150,10 +158,11 @@ def __getpagecountandfilename(self, record, properties):
and record["documentmasterid"] == property["documentmasterid"]
):
pagecount = property["pagecount"]
processedpagecount = self.__getprocessedpagecount(property, pagecount)
filename = property["filename"]
documentid = property["documentid"]
version = property["version"]
return pagecount, filename, documentid, version
return pagecount, processedpagecount, filename, documentid, version

def __getduplicatemsgattachment(self, records, attachmentproperties, attachment):
_occurances = []
Expand Down Expand Up @@ -401,6 +410,9 @@ def updatedocumentattributes(self, payload, userid):
)

return DocumentAttributes.update(newRows, oldRows)

def __getprocessedfilepath(self, attributes):
    """Return the 'processedfilepath' entry from a document's attributes
    dict, or None when no processed copy was recorded."""
    return attributes.get("processedfilepath")

def getdocuments(self, requestid,bcgovcode):
divisions_data = requests.request(
Expand Down Expand Up @@ -452,6 +464,8 @@ def getdocuments(self, requestid,bcgovcode):

for documentid in documents:
document = documents[documentid]
documentattributes = document["attributes"]
document["processedfilepath"] = self.__getprocessedfilepath(documentattributes)
documentdivisions = set(
map(lambda d: d["divisionid"], document["attributes"]["divisions"])
)
Expand Down
Binary file modified computingservices/DedupeServices/requirements.txt
Binary file not shown.
14 changes: 9 additions & 5 deletions computingservices/DedupeServices/services/dedupedbservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,31 @@
from datetime import datetime
import json

def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1):
def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1, processedfilepath="", processedpagecount = 1):
conn = getdbconnection()
try:
cursor = conn.cursor()

_incompatible = True if str(dedupeproducermessage.incompatible).lower() == 'true' else False


attributes = {"processedpagecount": processedpagecount} if processedpagecount > 1 else None
cursor.execute('INSERT INTO public."Documents" (version, \
filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer) RETURNING documentid;',
filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount,attributes) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer,%s) RETURNING documentid;',
(1, dedupeproducermessage.filename, dedupeproducermessage.outputdocumentmasterid or dedupeproducermessage.documentmasterid,
dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount))
dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount, json.dumps(attributes)))
conn.commit()
id_of_new_row = cursor.fetchone()

documentattribute = dedupeproducermessage.attributes
if processedfilepath:
documentattribute["processedfilepath"] = processedfilepath
if (dedupeproducermessage.attributes.get('isattachment', False) and dedupeproducermessage.trigger == 'recordreplace'):
documentmasterid = dedupeproducermessage.originaldocumentmasterid or dedupeproducermessage.documentmasterid
else:
documentmasterid = dedupeproducermessage.documentmasterid

cursor.execute('''UPDATE public."DocumentAttributes" SET attributes = %s WHERE documentmasterid = %s''',
(json.dumps(dedupeproducermessage.attributes), documentmasterid))
(json.dumps(documentattribute), documentmasterid))
conn.commit()

cursor.execute('INSERT INTO public."DocumentHashCodes" (documentid, \
Expand Down
4 changes: 2 additions & 2 deletions computingservices/DedupeServices/services/dedupeservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
def processmessage(message):
recordjobstart(message)
try:
hashcode, _pagecount = gets3documenthashcode(message)
savedocumentdetails(message, hashcode, _pagecount)
hashcode, _pagecount, _processedpagecount, _processedfilepath = gets3documenthashcode(message)
savedocumentdetails(message, hashcode, _pagecount, _processedfilepath, _processedpagecount)
recordjobend(message, False)
updateredactionstatus(message)
except(Exception) as error:
Expand Down
147 changes: 141 additions & 6 deletions computingservices/DedupeServices/services/s3documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import requests
from aws_requests_auth.aws_auth import AWSRequestsAuth
from pypdf import PdfReader, PdfWriter
import fitz
from io import BytesIO
from html import escape
import hashlib
Expand All @@ -22,8 +23,10 @@
dedupe_s3_env,
request_management_api,
file_conversion_types,
convert_to_pst
)

# font_path = '../utils/common/BCSans-Regular_2f.ttf'

def __getcredentialsbybcgovcode(bcgovcode):
_conn = getdbconnection()
Expand All @@ -50,13 +53,141 @@ def __getcredentialsbybcgovcode(bcgovcode):

return s3cred

def savedocumenttos3(pdfwithannotations, s3uripath, auth):
    """PUT the processed PDF bytes to the S3-compatible store at *s3uripath*,
    raising requests.HTTPError on any non-success status."""
    put_response = requests.put(s3uripath, data=pdfwithannotations, auth=auth)
    put_response.raise_for_status()

def __append_if_exists(text, key, value):
    """Append a '<key>: <value>' line to *text* when *value* is truthy;
    otherwise return *text* unchanged."""
    if not value:
        return text
    return text + f"{key}: {value}\n"

def extract_annotations_from_pdf(pdf_document, output_bytestream):
    """Rewrite the annotations of *pdf_document* and collect their details.

    Each annotation that carries comment text has its content replaced by a
    "Legend [page:index]" marker (the original text is rendered onto extra
    pages elsewhere); annotations without comment text are deleted from the
    page. The rewritten document is saved into *output_bytestream*.

    Returns a list of dicts with keys Legend, OriginalContent, Author,
    Subject and PageNumber — one per annotation that had comment text.
    """
    all_annotations = []
    output_pdf = fitz.open()  # fresh document that receives the rewritten pages
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        index = 1  # per-page counter used in the legend label
        annotations = page.annots()
        for annot in annotations:

            content = annot.info.get('content', '')
            if content:
                legend_text = f"Legend [{page_num}:{str(index)}]"
                new_content = legend_text + ":The comment text of the annotation is added as part of the pdf."
                index += 1
                # NOTE(review): in PyMuPDF the annotation's 'title' field holds
                # the author name.
                author = annot.info.get('title', '')
                if author:
                    new_author = "Original Document Comment"
                    annot.set_info(content=new_content,title=new_author)
                    annot.update()
                # NOTE(review): annotations that have content but no author keep
                # their original content/title untouched — confirm this is the
                # intended behavior.
                annot_dict = {
                    'Legend': legend_text,
                    'OriginalContent': content,
                    'Author': author,
                    'Subject': annot.info.get('subject', ''),
                    'PageNumber': page_num,
                    # 'CreationDate': annot.info.get('creationDate', ''),
                    # 'ModDate': annot.info.get('modDate', ''),
                    # 'Type': annot.type[1]
                }
                all_annotations.append(annot_dict)
            else:
                # Comment-less annotations are dropped entirely.
                page.delete_annot(annot)
    output_pdf.insert_pdf(pdf_document)
    if output_pdf:
        output_pdf.save(output_bytestream)
    return all_annotations


def __constructannotationtext(annot):
    """Render one extracted annotation dict as the text block written into
    the flattened PDF.

    Emits Legend, Subject, Author and the original comment text, one per
    line (empty fields are skipped), followed by a trailing blank line that
    separates consecutive annotations.

    Improvement: removed the large slab of commented-out code (CreationDate /
    ModDate / Type handling) that cluttered the function; behavior unchanged.
    """
    annot_text = ""
    annot_text = __append_if_exists(annot_text, 'Legend', annot["Legend"])
    annot_text = __append_if_exists(annot_text, 'Subject', annot["Subject"])
    annot_text = __append_if_exists(annot_text, 'Author', annot["Author"])
    annot_text = __append_if_exists(annot_text, 'Original Content', annot["OriginalContent"])
    annot_text += "\n"
    return annot_text

def add_annotations_as_text_to_pdf(source_document, bytes_stream):
    """Append the text of *source_document*'s annotations as extra PDF pages.

    extract_annotations_from_pdf() first rewrites each commented annotation
    to a "Legend [page:index]" marker; this function then inserts new pages
    after each source page and renders the collected annotation details
    (legend, subject, author, original comment) at 10pt with 15-unit line
    spacing, starting a further page whenever the current one would overflow
    (page height 792, 50-unit margins).

    Returns the page count of the resulting document, which is saved into
    *bytes_stream*; returns 1 when no document was produced.

    Improvement: removed a leftover debug print of every annotation's text.
    """
    output_bytestream = BytesIO()
    annotations = extract_annotations_from_pdf(source_document, output_bytestream)
    updated_stream = output_bytestream.getvalue()
    updated_document = fitz.open(stream=updated_stream)
    processedpagecount = 1
    destination_document = fitz.open()
    text_line_spacing = 15  # vertical space reserved per rendered text line
    page_height = 792       # page height in points; bottom 50 units kept free
    new_page_index = 0
    for page_index in range(updated_document.page_count):
        if new_page_index == 0:
            new_page_index = page_index
        text_start_position = 50  # top margin on a fresh annotation page
        annotations_on_page = [annot for annot in annotations if annot.get('PageNumber') == page_index]
        for annot in annotations_on_page:
            annot_text = __constructannotationtext(annot)
            lines_needed = len(annot_text.split('\n'))
            if text_start_position == 50:
                # First annotation for this source page: open a fresh page.
                new_page_index += 1
                updated_document.insert_page(new_page_index)
                new_page = updated_document.load_page(new_page_index)
            if text_start_position + lines_needed * text_line_spacing > page_height - 50:
                # This annotation would overflow: continue on another new page.
                new_page_index += 1
                updated_document.insert_page(new_page_index)
                new_page = updated_document.load_page(new_page_index)
                text_start_position = 50
            try:
                new_page.insert_text((50, text_start_position), annot_text, fontsize=10)
            except Exception as e:
                print(f"Error occurred while inserting text: {e}")
            text_start_position += lines_needed * text_line_spacing
        new_page_index += 1

    destination_document.insert_pdf(updated_document)

    if destination_document:
        processedpagecount = destination_document.page_count
        destination_document.save(bytes_stream)
        destination_document.close()
        del destination_document
    return processedpagecount

def handleannotationsinpdf(_bytes, filepath, extension, auth):
    """Flatten annotation comments in a PDF and upload the processed copy.

    _bytes: BytesIO (or bytes) holding the original PDF content.
    filepath: original S3 object path; the processed copy is stored alongside
        it with an "_updated" suffix before the extension.
    extension: file extension including the dot (e.g. ".pdf").
    auth: AWS request auth used for the S3 upload.

    Returns (processedpagecount, s3uripath). s3uripath is "" when the
    document has no annotations (nothing is uploaded) and processedpagecount
    is 1 in that case.
    """
    try:
        bytes_stream = BytesIO()
        s3uripath = ""
        source_document = fitz.open(stream=_bytes)
        processedpagecount = 1
        has_annots = source_document.has_annots()
        if has_annots:
            # Rewrites annotations in place and appends their text as pages.
            processedpagecount = add_annotations_as_text_to_pdf(source_document, bytes_stream)
        _updatedbytes = bytes_stream.getvalue()
        if source_document:
            source_document.close()
        if len(_updatedbytes) > 0:
            # New filename keeps the existing GUID: <name>_updated<ext>
            s3uripath = path.splitext(filepath)[0] + "_updated" + extension
            savedocumenttos3(_updatedbytes, s3uripath, auth)
        if bytes_stream:
            bytes_stream.close()
        del bytes_stream
        return processedpagecount, s3uripath
    except Exception as e:
        print(f"Error occurred while processing pdf with annotations: {e}")
        # BUG FIX: the original handler fell through and implicitly returned
        # None, which crashed the caller's two-value tuple unpack in
        # gets3documenthashcode(). Return safe defaults instead.
        return 1, ""

def gets3documenthashcode(producermessage):
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
pagecount = 1
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
s3_access_key_id = s3credentials.s3accesskey
s3_secret_access_key = s3credentials.s3secretkey

auth = AWSRequestsAuth(
aws_access_key=s3_access_key_id,
aws_secret_access_key=s3_secret_access_key,
Expand All @@ -65,6 +196,9 @@ def gets3documenthashcode(producermessage):
aws_service=dedupe_s3_service,
)

pagecount = 1
processedpagecount = 1
processedfilepath = ""
_filename, extension = path.splitext(producermessage.filename)
filepath = producermessage.s3filepath
producermessage.attributes = json.loads(producermessage.attributes)
Expand All @@ -76,8 +210,9 @@ def gets3documenthashcode(producermessage):
response = requests.get("{0}".format(filepath), auth=auth, stream=True)
reader = None
if extension.lower() in [".pdf"]:
reader = PdfReader(BytesIO(response.content))
# "No of pages in {0} is {1} ".format(_filename, len(reader.pages)))
_bytes = BytesIO(response.content)
processedpagecount, processedfilepath = handleannotationsinpdf(_bytes, filepath, extension, auth)
reader = PdfReader(_bytes)
pagecount = len(reader.pages)
attachments = []
if reader.attachments:
Expand Down Expand Up @@ -134,4 +269,4 @@ def gets3documenthashcode(producermessage):
for line in response.iter_lines():
sig.update(line)

return (sig.hexdigest(), pagecount)
return (sig.hexdigest(), pagecount, processedpagecount, processedfilepath)
1 change: 1 addition & 0 deletions computingservices/DedupeServices/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .foidedupeconfig import *
from .jsonmessageparser import getdedupeproducermessage,gets3credentialsobject
from .dbconnection import getdbconnection
from .commons.datetimehandler import convert_to_pst
Binary file not shown.
Loading

0 comments on commit e5860a8

Please sign in to comment.