Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revert flattening pdf changes #679

Merged
merged 7 commits into from
Dec 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions api/reviewer_api/models/DocumentMaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,15 @@ def getdocumentproperty(cls, ministryrequestid, deleted):
documentmasters = []
try:
sql = """select dm.documentmasterid, dm.processingparentid, d.documentid, d.version,
dhc.rank1hash, d.filename, d.pagecount, d.attributes, dm.parentid from "DocumentMaster" dm,
dhc.rank1hash, d.filename, d.pagecount, dm.parentid from "DocumentMaster" dm,
"Documents" d, "DocumentHashCodes" dhc
where dm.ministryrequestid = :ministryrequestid and dm.ministryrequestid = d.foiministryrequestid
and dm.documentmasterid = d.documentmasterid
and d.documentid = dhc.documentid order by dm.documentmasterid;"""
rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
for row in rs:
if (row["processingparentid"] is not None and row["processingparentid"] not in deleted) or (row["processingparentid"] is None and row["documentmasterid"] not in deleted):
documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "documentattribute": row["attributes"], "parentid": row["parentid"], "version": row["version"]})
documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "parentid": row["parentid"], "version": row["version"]})
except Exception as ex:
logging.error(ex)
db.session.close()
Expand Down
15 changes: 6 additions & 9 deletions api/reviewer_api/resources/foiflowmasterdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,11 @@ def post():
)

documentobjs = []
documentids = [documentinfo["file"]["documentid"] for documentinfo in data["documentobjs"]]
documents = documentservice().getdocumentbyids(documentids)
for documentinfo in data["documentobjs"]:
filepath = "/".join(documentinfo["file"]["filepath"].split("/")[4:])
if documentinfo["file"]["processedfilepath"]:
filepath = "/".join(documentinfo["file"]["processedfilepath"].split("/")[4:])
filepath = "/".join(documents[documentinfo["file"]["documentid"]].split("/")[4:])
filename, file_extension = os.path.splitext(filepath)
if file_extension in FILE_CONVERSION_FILE_TYPES:
filepath = filename + ".pdf"

documentinfo["s3url"] = s3client.generate_presigned_url(
ClientMethod="get_object",
Params={
Expand Down Expand Up @@ -315,9 +312,9 @@ def post(ministryrequestid):
# for save/put - stitch by division
div["s3path_save"] = s3path_save
for doc in div["documentlist"]:
filepathlist = doc["filepath"].split("/")[4:]
if doc["processedfilepath"]:
filepathlist = doc["processedfilepath"].split("/")[4:]
realfilepath = documentservice().getfilepathbydocumentid(doc["documentid"])
# filepathlist = doc["filepath"].split("/")[4:]
filepathlist = realfilepath.split("/")[4:]

# for load/get
filepath_get = "/".join(filepathlist)
Expand Down
19 changes: 3 additions & 16 deletions api/reviewer_api/services/documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def __updateproperties(
_att_in_properties = []
(
record["pagecount"],
record["processedpagecount"],
record["filename"],
record["documentid"],
record["version"],
Expand Down Expand Up @@ -121,7 +120,6 @@ def __updateproperties(

(
attachment["pagecount"],
attachment["processedpagecount"],
attachment["filename"],
attachment["documentid"],
attachment["version"],
Expand All @@ -140,15 +138,9 @@ def __filterrecords(self, records):
if record["recordid"] is None:
attchments.append(record)
return parentrecords, parentswithattachments, attchments

def __getprocessedpagecount(self, property, pagecount):
    """Return the processed page count stored in a property's document attributes.

    Falls back to *pagecount* when the attributes are missing/empty or do not
    carry a 'processedpagecount' entry.
    """
    attributes = property["documentattribute"]
    if not attributes:
        return pagecount
    return attributes.get("processedpagecount", pagecount)


def __getpagecountandfilename(self, record, properties):
pagecount = 0
processedpagecount = 0
filename = record["filename"] if "filename" in record else None
documentid = None
version = 0
Expand All @@ -158,11 +150,10 @@ def __getpagecountandfilename(self, record, properties):
and record["documentmasterid"] == property["documentmasterid"]
):
pagecount = property["pagecount"]
processedpagecount = self.__getprocessedpagecount(property, pagecount)
filename = property["filename"]
documentid = property["documentid"]
version = property["version"]
return pagecount, processedpagecount, filename, documentid, version
return pagecount, filename, documentid, version

def __getduplicatemsgattachment(self, records, attachmentproperties, attachment):
_occurances = []
Expand Down Expand Up @@ -411,9 +402,7 @@ def updatedocumentattributes(self, payload, userid):

return DocumentAttributes.update(newRows, oldRows)

def __getprocessedfilepath(self, attributes):
    """Return the 'processedfilepath' entry of *attributes*, or None when absent."""
    return attributes.get("processedfilepath")


def getdocuments(self, requestid,bcgovcode):
divisions_data = requests.request(
method='GET',
Expand Down Expand Up @@ -464,8 +453,6 @@ def getdocuments(self, requestid,bcgovcode):

for documentid in documents:
document = documents[documentid]
documentattributes = document["attributes"]
document["processedfilepath"] = self.__getprocessedfilepath(documentattributes)
documentdivisions = set(
map(lambda d: d["divisionid"], document["attributes"]["divisions"])
)
Expand Down
14 changes: 5 additions & 9 deletions computingservices/DedupeServices/services/dedupedbservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,27 @@
from datetime import datetime
import json

def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1, processedfilepath="", processedpagecount = 1):
def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1):
conn = getdbconnection()
try:
cursor = conn.cursor()

_incompatible = True if str(dedupeproducermessage.incompatible).lower() == 'true' else False

attributes = {"processedpagecount": processedpagecount} if processedpagecount > 1 else None
cursor.execute('INSERT INTO public."Documents" (version, \
filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount,attributes) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer,%s) RETURNING documentid;',
filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer) RETURNING documentid;',
(1, dedupeproducermessage.filename, dedupeproducermessage.outputdocumentmasterid or dedupeproducermessage.documentmasterid,
dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount, json.dumps(attributes)))
dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount))
conn.commit()
id_of_new_row = cursor.fetchone()

documentattribute = dedupeproducermessage.attributes
if processedfilepath:
documentattribute["processedfilepath"] = processedfilepath
if (dedupeproducermessage.attributes.get('isattachment', False) and dedupeproducermessage.trigger == 'recordreplace'):
documentmasterid = dedupeproducermessage.originaldocumentmasterid or dedupeproducermessage.documentmasterid
else:
documentmasterid = dedupeproducermessage.documentmasterid

cursor.execute('''UPDATE public."DocumentAttributes" SET attributes = %s WHERE documentmasterid = %s''',
(json.dumps(documentattribute), documentmasterid))
(json.dumps(dedupeproducermessage.attributes), documentmasterid))
conn.commit()

cursor.execute('INSERT INTO public."DocumentHashCodes" (documentid, \
Expand Down
4 changes: 2 additions & 2 deletions computingservices/DedupeServices/services/dedupeservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
def processmessage(message):
recordjobstart(message)
try:
hashcode, _pagecount, _processedpagecount, _processedfilepath = gets3documenthashcode(message)
savedocumentdetails(message, hashcode, _pagecount, _processedfilepath, _processedpagecount)
hashcode, _pagecount = gets3documenthashcode(message)
savedocumentdetails(message, hashcode, _pagecount)
recordjobend(message, False)
updateredactionstatus(message)
except(Exception) as error:
Expand Down
143 changes: 5 additions & 138 deletions computingservices/DedupeServices/services/s3documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import requests
from aws_requests_auth.aws_auth import AWSRequestsAuth
from pypdf import PdfReader, PdfWriter
import fitz
from io import BytesIO
from html import escape
import hashlib
Expand All @@ -22,8 +21,7 @@
dedupe_s3_service,
dedupe_s3_env,
request_management_api,
file_conversion_types,
convert_to_pst
file_conversion_types
)

def __getcredentialsbybcgovcode(bcgovcode):
Expand Down Expand Up @@ -51,135 +49,6 @@ def __getcredentialsbybcgovcode(bcgovcode):

return s3cred

def savedocumenttos3(pdfwithannotations, s3uripath, auth):
    """Upload the given PDF bytes to *s3uripath* with an authenticated PUT.

    Raises an HTTPError (via raise_for_status) when the upload is rejected.
    """
    upload = requests.put(s3uripath, data=pdfwithannotations, auth=auth)
    upload.raise_for_status()

def __append_if_exists(text, key, value):
    """Append a "key: value" line to *text* when *value* is truthy; return the result."""
    return f"{text}{key}: {value}\n" if value else text

def extract_annotations_from_pdf(pdf_document, output_bytestream):
    """Harvest commented annotations from *pdf_document*, rewriting them in place.

    Each annotation that carries comment text has its content replaced by a
    numbered "Legend [page:index]" marker and its author rewritten to
    "Original Document Comment"; the original text is collected so it can be
    rendered elsewhere. Annotations without comment text are deleted from
    their page. The edited document is serialized into *output_bytestream*.

    Returns a list of dicts with keys: Legend, OriginalContent, Author,
    Subject, PageNumber.
    """
    all_annotations = []
    output_pdf = fitz.open()  # scratch document used only to serialize the edited pages
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        index = 1  # per-page counter used in the legend label (resets each page)
        annotations = page.annots()
        for annot in annotations:

            content = annot.info.get('content', '')
            if content:
                legend_text = f"Legend [{page_num}:{str(index)}]"
                new_content = legend_text + ":The comment text of the annotation is added as part of the pdf."
                index += 1
                author = annot.info.get('title', '')  # PDF annotation "title" holds the author
                new_author = "Original Document Comment"
                annot.set_info(content=new_content,title=new_author)
                annot.update()
                annot_dict = {
                    'Legend': legend_text,
                    'OriginalContent': content,
                    'Author': author,
                    'Subject': annot.info.get('subject', ''),
                    'PageNumber': page_num,
                    # 'CreationDate': annot.info.get('creationDate', ''),
                    # 'ModDate': annot.info.get('modDate', ''),
                    # 'Type': annot.type[1]
                }
                all_annotations.append(annot_dict)
            else:
                # No comment text: drop the annotation entirely.
                page.delete_annot(annot)
    # Copy the (now mutated) source document into the output and save it.
    output_pdf.insert_pdf(pdf_document)
    if output_pdf:  # NOTE(review): a fitz.Document is always truthy here — guard appears redundant; confirm
        output_pdf.save(output_bytestream)
    return all_annotations


def __constructannotationtext(annot):
    """Format one harvested annotation dict as human-readable text.

    Emits one "Label: value" line per populated field, in a fixed order,
    then a trailing blank line separating this annotation from the next.
    """
    rendered = ""
    for label, key in (("Legend", "Legend"),
                       ("Subject", "Subject"),
                       ("Author", "Author"),
                       ("Original Content", "OriginalContent")):
        rendered = __append_if_exists(rendered, label, annot[key])
    return rendered + "\n"

def add_annotations_as_text_to_pdf(source_document, bytes_stream):
    """Append each page's annotation comments as extra text pages.

    Harvests annotations via extract_annotations_from_pdf (which also rewrites
    them in place), then inserts new pages after each source page containing
    the formatted comment text. Saves the combined result into *bytes_stream*
    and returns the resulting total page count.
    """
    output_bytestream = BytesIO()
    annotations = extract_annotations_from_pdf(source_document, output_bytestream)
    updated_stream = output_bytestream.getvalue()
    updated_document = fitz.open(stream=updated_stream)
    processedpagecount = 1
    destination_document = fitz.open()
    text_line_spacing = 15  # vertical points advanced per rendered text line
    page_height = 792  # assumed page height in points (792 = US Letter) — TODO confirm
    new_page_index = 0  # tracks the insertion point as inserted pages shift indices
    for page_index in range(updated_document.page_count):
        if new_page_index == 0:
            new_page_index = page_index
        text_start_position = 50  # top margin for the first line on a fresh text page
        annotations_on_page = [annot for annot in annotations if annot.get('PageNumber') == page_index]
        for annot in annotations_on_page:
            annot_text = __constructannotationtext(annot)
            lines_needed = len(annot_text.split('\n'))
            # position == 50 means nothing written yet for this source page:
            # open a fresh text page right after it.
            if text_start_position == 50:
                new_page_index += 1
                updated_document.insert_page(new_page_index)
                new_page = updated_document.load_page(new_page_index)
            # Overflow: start another text page when this block will not fit.
            # NOTE(review): for an oversized first block both conditions can
            # fire, inserting two pages back-to-back — confirm intended.
            if text_start_position + lines_needed * text_line_spacing > page_height - 50:
                new_page_index += 1
                updated_document.insert_page(new_page_index)
                new_page = updated_document.load_page(new_page_index)
                text_start_position = 50
            try:
                new_page.insert_text((50, text_start_position), annot_text, fontsize=10)
            except Exception as e:
                print(f"Error occurred while inserting text: {e}")
            text_start_position += lines_needed * text_line_spacing
        new_page_index += 1

    destination_document.insert_pdf(updated_document)

    if destination_document:
        processedpagecount = destination_document.page_count
        destination_document.save(bytes_stream)
        destination_document.close()
        del destination_document
    return processedpagecount

def handleannotationsinpdf(_bytes, filepath, extension, auth):
    """Flatten PDF comment annotations into visible text pages.

    If the PDF in *_bytes* has annotations, their text is appended to the
    document as extra pages and the modified copy is uploaded to S3 beside
    the original (same GUID name with an "_updated" suffix).

    Returns a (processedpagecount, s3uripath) tuple. When the document has
    no annotations — or processing fails — the defaults (1, "") are
    returned so callers can always unpack the pair.
    """
    bytes_stream = None
    try:
        bytes_stream = BytesIO()
        s3uripath = ""
        source_document = fitz.open(stream=_bytes)
        processedpagecount = 1
        if source_document.has_annots():
            processedpagecount = add_annotations_as_text_to_pdf(source_document, bytes_stream)
        _updatedbytes = bytes_stream.getvalue()
        if source_document:
            source_document.close()
        if len(_updatedbytes) > 0:
            # new filename with existing guid: <filename>_updated<extension>
            s3uripath = path.splitext(filepath)[0] + "_updated" + extension
            savedocumenttos3(_updatedbytes, s3uripath, auth)
        return processedpagecount, s3uripath
    except Exception as e:
        print(f"Error occurred while processing pdf with annotations: {e}")
        # BUG FIX: previously fell through and implicitly returned None, which
        # broke the caller's two-value unpacking; return safe defaults instead.
        return 1, ""
    finally:
        # Always release the in-memory buffer, even on failure.
        if bytes_stream:
            bytes_stream.close()

def gets3documenthashcode(producermessage):
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
s3_access_key_id = s3credentials.s3accesskey
Expand All @@ -193,8 +62,6 @@ def gets3documenthashcode(producermessage):
)

pagecount = 1
processedpagecount = 1
processedfilepath = ""
_filename, extension = path.splitext(producermessage.filename)
filepath = producermessage.s3filepath
producermessage.attributes = json.loads(producermessage.attributes)
Expand All @@ -206,9 +73,9 @@ def gets3documenthashcode(producermessage):
response = requests.get("{0}".format(filepath), auth=auth, stream=True)
reader = None
if extension.lower() in [".pdf"]:
_bytes = BytesIO(response.content)
processedpagecount, processedfilepath = handleannotationsinpdf(_bytes, filepath, extension, auth)
reader = PdfReader(_bytes)
reader = PdfReader(BytesIO(response.content))

# "No of pages in {0} is {1} ".format(_filename, len(reader.pages)))
pagecount = len(reader.pages)
attachments = []
if reader.attachments:
Expand Down Expand Up @@ -265,4 +132,4 @@ def gets3documenthashcode(producermessage):
for line in response.iter_lines():
sig.update(line)

return (sig.hexdigest(), pagecount, processedpagecount, processedfilepath)
return (sig.hexdigest(), pagecount)
1 change: 0 additions & 1 deletion computingservices/DedupeServices/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@
from .foidedupeconfig import *
from .jsonmessageparser import getdedupeproducermessage,gets3credentialsobject
from .dbconnection import getdbconnection
from .commons.datetimehandler import convert_to_pst
24 changes: 0 additions & 24 deletions computingservices/DedupeServices/utils/commons/datetimehandler.py

This file was deleted.

Loading
Loading