Skip to content

Commit

Permalink
Merge pull request #665 from bcgov/dev-DV-4708
Browse files Browse the repository at this point in the history
Flatten PDF - Dedupe, web, api changes
  • Loading branch information
divyav-aot authored Dec 20, 2023
2 parents a48441e + f8a1fe0 commit e5860a8
Show file tree
Hide file tree
Showing 13 changed files with 226 additions and 45 deletions.
4 changes: 2 additions & 2 deletions api/reviewer_api/models/DocumentMaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,15 @@ def getdocumentproperty(cls, ministryrequestid, deleted):
documentmasters = []
try:
sql = """select dm.documentmasterid, dm.processingparentid, d.documentid, d.version,
dhc.rank1hash, d.filename, d.pagecount, dm.parentid from "DocumentMaster" dm,
dhc.rank1hash, d.filename, d.pagecount, d.attributes, dm.parentid from "DocumentMaster" dm,
"Documents" d, "DocumentHashCodes" dhc
where dm.ministryrequestid = :ministryrequestid and dm.ministryrequestid = d.foiministryrequestid
and dm.documentmasterid = d.documentmasterid
and d.documentid = dhc.documentid order by dm.documentmasterid;"""
rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
for row in rs:
if (row["processingparentid"] is not None and row["processingparentid"] not in deleted) or (row["processingparentid"] is None and row["documentmasterid"] not in deleted):
documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "parentid": row["parentid"], "version": row["version"]})
documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "documentattribute": row["attributes"], "parentid": row["parentid"], "version": row["version"]})
except Exception as ex:
logging.error(ex)
db.session.close()
Expand Down
20 changes: 12 additions & 8 deletions api/reviewer_api/resources/foiflowmasterdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

from reviewer_api.services.radactionservice import redactionservice
from reviewer_api.services.documentservice import documentservice
from reviewer_api.utils.constants import FILE_CONVERSION_FILE_TYPES

API = Namespace(
"FOI Flow Master Data", description="Endpoints for FOI Flow master data"
Expand Down Expand Up @@ -128,11 +129,14 @@ def post():
)

documentobjs = []
documentids = [documentinfo["file"]["documentid"] for documentinfo in data["documentobjs"]]
documents = documentservice().getdocumentbyids(documentids)
for documentinfo in data["documentobjs"]:
filepath = "/".join(documents[documentinfo["file"]["documentid"]].split("/")[4:])
filepath = "/".join(documentinfo["file"]["filepath"].split("/")[4:])
if documentinfo["file"]["processedfilepath"]:
filepath = "/".join(documentinfo["file"]["processedfilepath"].split("/")[4:])
filename, file_extension = os.path.splitext(filepath)
if file_extension in FILE_CONVERSION_FILE_TYPES:
filepath = filename + ".pdf"

documentinfo["s3url"] = s3client.generate_presigned_url(
ClientMethod="get_object",
Params={
Expand Down Expand Up @@ -297,8 +301,6 @@ def post(ministryrequestid):
filepathlist[0], division_name
)

# filename_put, file_extension_put = os.path.splitext(filepath_put)
# filepath_put = filename_put+'.pdf'
s3path_save = s3client.generate_presigned_url(
ClientMethod="get_object",
Params={
Expand All @@ -313,11 +315,13 @@ def post(ministryrequestid):
# for save/put - stitch by division
div["s3path_save"] = s3path_save
for doc in div["documentlist"]:
realfilepath = documentservice().getfilepathbydocumentid(doc["documentid"])
# filepathlist = doc["filepath"].split("/")[4:]
filepathlist = realfilepath.split("/")[4:]
filepathlist = doc["filepath"].split("/")[4:]
if doc["processedfilepath"]:
filepathlist = doc["processedfilepath"].split("/")[4:]

# for load/get
filepath_get = "/".join(filepathlist)

filename_get, file_extension_get = os.path.splitext(
filepath_get
)
Expand Down
18 changes: 16 additions & 2 deletions api/reviewer_api/services/documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def __updateproperties(
_att_in_properties = []
(
record["pagecount"],
record["processedpagecount"],
record["filename"],
record["documentid"],
record["version"],
Expand Down Expand Up @@ -120,6 +121,7 @@ def __updateproperties(

(
attachment["pagecount"],
attachment["processedpagecount"],
attachment["filename"],
attachment["documentid"],
attachment["version"],
Expand All @@ -138,9 +140,15 @@ def __filterrecords(self, records):
if record["recordid"] is None:
attchments.append(record)
return parentrecords, parentswithattachments, attchments


def __getprocessedpagecount(self, property, pagecount):
    """Return the processed page count recorded in the property's document
    attributes; fall back to *pagecount* when the attributes are missing or
    carry no 'processedpagecount' entry."""
    attributes = property["documentattribute"]
    if not attributes:
        return pagecount
    return attributes.get("processedpagecount", pagecount)

def __getpagecountandfilename(self, record, properties):
pagecount = 0
processedpagecount = 0
filename = record["filename"] if "filename" in record else None
documentid = None
version = 0
Expand All @@ -150,10 +158,11 @@ def __getpagecountandfilename(self, record, properties):
and record["documentmasterid"] == property["documentmasterid"]
):
pagecount = property["pagecount"]
processedpagecount = self.__getprocessedpagecount(property, pagecount)
filename = property["filename"]
documentid = property["documentid"]
version = property["version"]
return pagecount, filename, documentid, version
return pagecount, processedpagecount, filename, documentid, version

def __getduplicatemsgattachment(self, records, attachmentproperties, attachment):
_occurances = []
Expand Down Expand Up @@ -401,6 +410,9 @@ def updatedocumentattributes(self, payload, userid):
)

return DocumentAttributes.update(newRows, oldRows)

def __getprocessedfilepath(self, attributes):
    """Return the 'processedfilepath' entry from a document's attributes
    dict, or None when no processed copy was recorded."""
    return attributes.get("processedfilepath")

def getdocuments(self, requestid,bcgovcode):
divisions_data = requests.request(
Expand Down Expand Up @@ -452,6 +464,8 @@ def getdocuments(self, requestid,bcgovcode):

for documentid in documents:
document = documents[documentid]
documentattributes = document["attributes"]
document["processedfilepath"] = self.__getprocessedfilepath(documentattributes)
documentdivisions = set(
map(lambda d: d["divisionid"], document["attributes"]["divisions"])
)
Expand Down
Binary file modified computingservices/DedupeServices/requirements.txt
Binary file not shown.
14 changes: 9 additions & 5 deletions computingservices/DedupeServices/services/dedupedbservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,31 @@
from datetime import datetime
import json

def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1):
def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1, processedfilepath="", processedpagecount = 1):
conn = getdbconnection()
try:
cursor = conn.cursor()

_incompatible = True if str(dedupeproducermessage.incompatible).lower() == 'true' else False


attributes = {"processedpagecount": processedpagecount} if processedpagecount > 1 else None
cursor.execute('INSERT INTO public."Documents" (version, \
filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer) RETURNING documentid;',
filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount,attributes) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer,%s) RETURNING documentid;',
(1, dedupeproducermessage.filename, dedupeproducermessage.outputdocumentmasterid or dedupeproducermessage.documentmasterid,
dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount))
dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount, json.dumps(attributes)))
conn.commit()
id_of_new_row = cursor.fetchone()

documentattribute = dedupeproducermessage.attributes
if processedfilepath:
documentattribute["processedfilepath"] = processedfilepath
if (dedupeproducermessage.attributes.get('isattachment', False) and dedupeproducermessage.trigger == 'recordreplace'):
documentmasterid = dedupeproducermessage.originaldocumentmasterid or dedupeproducermessage.documentmasterid
else:
documentmasterid = dedupeproducermessage.documentmasterid

cursor.execute('''UPDATE public."DocumentAttributes" SET attributes = %s WHERE documentmasterid = %s''',
(json.dumps(dedupeproducermessage.attributes), documentmasterid))
(json.dumps(documentattribute), documentmasterid))
conn.commit()

cursor.execute('INSERT INTO public."DocumentHashCodes" (documentid, \
Expand Down
4 changes: 2 additions & 2 deletions computingservices/DedupeServices/services/dedupeservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
def processmessage(message):
recordjobstart(message)
try:
hashcode, _pagecount = gets3documenthashcode(message)
savedocumentdetails(message, hashcode, _pagecount)
hashcode, _pagecount, _processedpagecount, _processedfilepath = gets3documenthashcode(message)
savedocumentdetails(message, hashcode, _pagecount, _processedfilepath, _processedpagecount)
recordjobend(message, False)
updateredactionstatus(message)
except(Exception) as error:
Expand Down
147 changes: 141 additions & 6 deletions computingservices/DedupeServices/services/s3documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import requests
from aws_requests_auth.aws_auth import AWSRequestsAuth
from pypdf import PdfReader, PdfWriter
import fitz
from io import BytesIO
from html import escape
import hashlib
Expand All @@ -22,8 +23,10 @@
dedupe_s3_env,
request_management_api,
file_conversion_types,
convert_to_pst
)

# font_path = '../utils/common/BCSans-Regular_2f.ttf'

def __getcredentialsbybcgovcode(bcgovcode):
_conn = getdbconnection()
Expand All @@ -50,13 +53,141 @@ def __getcredentialsbybcgovcode(bcgovcode):

return s3cred

def savedocumenttos3(pdfwithannotations, s3uripath, auth):
    """PUT the processed PDF bytes to the S3-compatible store at *s3uripath*,
    raising requests.HTTPError on any non-success status."""
    put_response = requests.put(s3uripath, data=pdfwithannotations, auth=auth)
    put_response.raise_for_status()

def __append_if_exists(text, key, value):
    """Append a '<key>: <value>' line to *text* when *value* is truthy;
    otherwise return *text* unchanged."""
    if not value:
        return text
    return text + f"{key}: {value}\n"

def extract_annotations_from_pdf(pdf_document, output_bytestream):
    """Rewrite the annotations of *pdf_document* and collect their details.

    Each annotation that carries comment text has its content replaced by a
    "Legend [page:index]" marker (the original text is rendered onto extra
    pages elsewhere); annotations without comment text are deleted from the
    page. The rewritten document is saved into *output_bytestream*.

    Returns a list of dicts with keys Legend, OriginalContent, Author,
    Subject and PageNumber — one per annotation that had comment text.
    """
    all_annotations = []
    output_pdf = fitz.open()  # fresh document that receives the rewritten pages
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        index = 1  # per-page counter used in the legend label
        annotations = page.annots()
        for annot in annotations:

            content = annot.info.get('content', '')
            if content:
                legend_text = f"Legend [{page_num}:{str(index)}]"
                new_content = legend_text + ":The comment text of the annotation is added as part of the pdf."
                index += 1
                # NOTE(review): in PyMuPDF the annotation's 'title' field holds
                # the author name.
                author = annot.info.get('title', '')
                if author:
                    new_author = "Original Document Comment"
                    annot.set_info(content=new_content,title=new_author)
                    annot.update()
                # NOTE(review): annotations that have content but no author keep
                # their original content/title untouched — confirm this is the
                # intended behavior.
                annot_dict = {
                    'Legend': legend_text,
                    'OriginalContent': content,
                    'Author': author,
                    'Subject': annot.info.get('subject', ''),
                    'PageNumber': page_num,
                    # 'CreationDate': annot.info.get('creationDate', ''),
                    # 'ModDate': annot.info.get('modDate', ''),
                    # 'Type': annot.type[1]
                }
                all_annotations.append(annot_dict)
            else:
                # Comment-less annotations are dropped entirely.
                page.delete_annot(annot)
    output_pdf.insert_pdf(pdf_document)
    if output_pdf:
        output_pdf.save(output_bytestream)
    return all_annotations


def __constructannotationtext(annot):
    """Render one extracted annotation dict as the text block written into
    the flattened PDF.

    Emits Legend, Subject, Author and the original comment text, one per
    line (empty fields are skipped), followed by a trailing blank line that
    separates consecutive annotations.

    Improvement: removed the large slab of commented-out code (CreationDate /
    ModDate / Type handling) that cluttered the function; behavior unchanged.
    """
    annot_text = ""
    annot_text = __append_if_exists(annot_text, 'Legend', annot["Legend"])
    annot_text = __append_if_exists(annot_text, 'Subject', annot["Subject"])
    annot_text = __append_if_exists(annot_text, 'Author', annot["Author"])
    annot_text = __append_if_exists(annot_text, 'Original Content', annot["OriginalContent"])
    annot_text += "\n"
    return annot_text

def add_annotations_as_text_to_pdf(source_document, bytes_stream):
    """Append the text of *source_document*'s annotations as extra PDF pages.

    extract_annotations_from_pdf() first rewrites each commented annotation
    to a "Legend [page:index]" marker; this function then inserts new pages
    after each source page and renders the collected annotation details
    (legend, subject, author, original comment) at 10pt with 15-unit line
    spacing, starting a further page whenever the current one would overflow
    (page height 792, 50-unit margins).

    Returns the page count of the resulting document, which is saved into
    *bytes_stream*; returns 1 when no document was produced.

    Improvement: removed a leftover debug print of every annotation's text.
    """
    output_bytestream = BytesIO()
    annotations = extract_annotations_from_pdf(source_document, output_bytestream)
    updated_stream = output_bytestream.getvalue()
    updated_document = fitz.open(stream=updated_stream)
    processedpagecount = 1
    destination_document = fitz.open()
    text_line_spacing = 15  # vertical space reserved per rendered text line
    page_height = 792       # page height in points; bottom 50 units kept free
    new_page_index = 0
    for page_index in range(updated_document.page_count):
        if new_page_index == 0:
            new_page_index = page_index
        text_start_position = 50  # top margin on a fresh annotation page
        annotations_on_page = [annot for annot in annotations if annot.get('PageNumber') == page_index]
        for annot in annotations_on_page:
            annot_text = __constructannotationtext(annot)
            lines_needed = len(annot_text.split('\n'))
            if text_start_position == 50:
                # First annotation for this source page: open a fresh page.
                new_page_index += 1
                updated_document.insert_page(new_page_index)
                new_page = updated_document.load_page(new_page_index)
            if text_start_position + lines_needed * text_line_spacing > page_height - 50:
                # This annotation would overflow: continue on another new page.
                new_page_index += 1
                updated_document.insert_page(new_page_index)
                new_page = updated_document.load_page(new_page_index)
                text_start_position = 50
            try:
                new_page.insert_text((50, text_start_position), annot_text, fontsize=10)
            except Exception as e:
                print(f"Error occurred while inserting text: {e}")
            text_start_position += lines_needed * text_line_spacing
        new_page_index += 1

    destination_document.insert_pdf(updated_document)

    if destination_document:
        processedpagecount = destination_document.page_count
        destination_document.save(bytes_stream)
        destination_document.close()
        del destination_document
    return processedpagecount

def handleannotationsinpdf(_bytes, filepath, extension, auth):
    """Flatten annotation comments in a PDF and upload the processed copy.

    _bytes: BytesIO (or bytes) holding the original PDF content.
    filepath: original S3 object path; the processed copy is stored alongside
        it with an "_updated" suffix before the extension.
    extension: file extension including the dot (e.g. ".pdf").
    auth: AWS request auth used for the S3 upload.

    Returns (processedpagecount, s3uripath). s3uripath is "" when the
    document has no annotations (nothing is uploaded) and processedpagecount
    is 1 in that case.
    """
    try:
        bytes_stream = BytesIO()
        s3uripath = ""
        source_document = fitz.open(stream=_bytes)
        processedpagecount = 1
        has_annots = source_document.has_annots()
        if has_annots:
            # Rewrites annotations in place and appends their text as pages.
            processedpagecount = add_annotations_as_text_to_pdf(source_document, bytes_stream)
        _updatedbytes = bytes_stream.getvalue()
        if source_document:
            source_document.close()
        if len(_updatedbytes) > 0:
            # New filename keeps the existing GUID: <name>_updated<ext>
            s3uripath = path.splitext(filepath)[0] + "_updated" + extension
            savedocumenttos3(_updatedbytes, s3uripath, auth)
        if bytes_stream:
            bytes_stream.close()
        del bytes_stream
        return processedpagecount, s3uripath
    except Exception as e:
        print(f"Error occurred while processing pdf with annotations: {e}")
        # BUG FIX: the original handler fell through and implicitly returned
        # None, which crashed the caller's two-value tuple unpack in
        # gets3documenthashcode(). Return safe defaults instead.
        return 1, ""

def gets3documenthashcode(producermessage):
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
pagecount = 1
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
s3_access_key_id = s3credentials.s3accesskey
s3_secret_access_key = s3credentials.s3secretkey

auth = AWSRequestsAuth(
aws_access_key=s3_access_key_id,
aws_secret_access_key=s3_secret_access_key,
Expand All @@ -65,6 +196,9 @@ def gets3documenthashcode(producermessage):
aws_service=dedupe_s3_service,
)

pagecount = 1
processedpagecount = 1
processedfilepath = ""
_filename, extension = path.splitext(producermessage.filename)
filepath = producermessage.s3filepath
producermessage.attributes = json.loads(producermessage.attributes)
Expand All @@ -76,8 +210,9 @@ def gets3documenthashcode(producermessage):
response = requests.get("{0}".format(filepath), auth=auth, stream=True)
reader = None
if extension.lower() in [".pdf"]:
reader = PdfReader(BytesIO(response.content))
# "No of pages in {0} is {1} ".format(_filename, len(reader.pages)))
_bytes = BytesIO(response.content)
processedpagecount, processedfilepath = handleannotationsinpdf(_bytes, filepath, extension, auth)
reader = PdfReader(_bytes)
pagecount = len(reader.pages)
attachments = []
if reader.attachments:
Expand Down Expand Up @@ -134,4 +269,4 @@ def gets3documenthashcode(producermessage):
for line in response.iter_lines():
sig.update(line)

return (sig.hexdigest(), pagecount)
return (sig.hexdigest(), pagecount, processedpagecount, processedfilepath)
1 change: 1 addition & 0 deletions computingservices/DedupeServices/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .foidedupeconfig import *
from .jsonmessageparser import getdedupeproducermessage,gets3credentialsobject
from .dbconnection import getdbconnection
from .commons.datetimehandler import convert_to_pst
Binary file not shown.
Loading

0 comments on commit e5860a8

Please sign in to comment.