Merge branch 'dev-AS-FOIMOD-3436' into test-rook-FOIMOD-3436

bcgov · Sep 26, 2024 · a6610d0 · a6610d0
2 parents 4beca36 + 7d13478
commit a6610d0
Show file tree

Hide file tree

Showing 26 changed files with 433 additions and 77 deletions.
diff --git a/.gitignore b/.gitignore
@@ -100,3 +100,5 @@ bld/
 MCS.FOI.S3FileConversion/MCS.FOI.S3FileConversion/QtBinariesWindows/
 computingservices/ZippingServices/env/*
 openshift/templates/zippingservice/zipper.env
+*.locenv
+
diff --git a/api/reviewer_api/resources/redaction.py b/api/reviewer_api/resources/redaction.py
@@ -342,7 +342,7 @@ def post():
         try:
             requestjson = request.get_json()
             print("\nrequestjson:",requestjson)
-            if(requestjson['bcgovcode'] == "mcf"):
+            if(requestjson['bcgovcode'] == "mcf" and requestjson['requesttype'] == "personal"):
                 finalpackageschema = MCFFinalPackageSchema().load(requestjson) 
             else:
                 finalpackageschema = FinalPackageSchema().load(requestjson)

diff --git a/api/reviewer_api/schemas/finalpackage.py b/api/reviewer_api/schemas/finalpackage.py
@@ -30,6 +30,7 @@ class FinalPackageSchema(Schema):
     )
     summarydocuments = fields.Nested(SummarySchema, allow_none=True)
     redactionlayerid = fields.Int(data_key="redactionlayerid", allow_none=False)
+    requesttype = fields.Str(data_key="requesttype", allow_none=False)
 
 class SummaryRecordSchema(Schema):
     recordname = fields.Str(data_key="recordname", allow_none=True)
@@ -53,4 +54,5 @@ class MCFFinalPackageSchema(Schema):
         AttributeSchema, many=True, required=True, allow_none=False
     )
     summarydocuments = fields.Nested(MCFSummarySchema, allow_none=True)
-    redactionlayerid = fields.Int(data_key="redactionlayerid", allow_none=False)
+    redactionlayerid = fields.Int(data_key="redactionlayerid", allow_none=False)
+    requesttype = fields.Str(data_key="requesttype", allow_none=False)
diff --git a/api/reviewer_api/schemas/redline.py b/api/reviewer_api/schemas/redline.py
@@ -29,4 +29,5 @@ class RedlineSchema(Schema):
         AttributeSchema, many=True, required=True, allow_none=False
     )
     summarydocuments = fields.Nested(SummarySchema, allow_none=True)
-    redactionlayerid = fields.Int(data_key="redactionlayerid", allow_none=False)
+    redactionlayerid = fields.Int(data_key="redactionlayerid", allow_none=False)
+    requesttype = fields.Str(data_key="requesttype", allow_none=False)
diff --git a/api/reviewer_api/services/radactionservice.py b/api/reviewer_api/services/radactionservice.py
@@ -145,7 +145,8 @@ def __preparemessageforsummaryservice(self, messageschema, userinfo, job):
             "finaloutput": to_json(""),
             "attributes": to_json(messageschema["attributes"]),
             "summarydocuments": json.dumps(messageschema["summarydocuments"]),
-            "redactionlayerid": json.dumps(messageschema["redactionlayerid"])
+            "redactionlayerid": json.dumps(messageschema["redactionlayerid"]),
+            "requesttype": messageschema["requesttype"],
         }
         return _message
 

diff --git a/computingservices/DedupeServices/requirements.txt b/computingservices/DedupeServices/requirements.txt
diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py
@@ -12,8 +12,11 @@
 from html import escape
 import hashlib
 import uuid
+import boto3
+from botocore.config import Config
 from re import sub
 import fitz
+import PyPDF2
 from utils import (
     gets3credentialsobject,
     getdedupeproducermessage,
@@ -24,6 +27,22 @@
     request_management_api,
     file_conversion_types
 )
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.pdfbase import pdfmetrics
+from reportlab.lib.pagesizes import letter
+import os
+
+# Get the directory of the current Python file (inside the 'service' folder)
+service_folder_path = os.path.dirname(os.path.abspath(__file__))
+# Navigate to the parent directory (common folder)
+common_folder_path = os.path.dirname(service_folder_path)
+# Construct the path to the 'utils' folder & get the path to the 'BCSans-Bold.ttf' font file inside the 'utils' folder
+utils_folder_path = os.path.join(common_folder_path, "utils")
+font_path = os.path.join(utils_folder_path, "fonts", "BCSans-Regular_2f.ttf")
+pdfmetrics.registerFont(TTFont('BC-Sans', font_path))
+
 
 def __getcredentialsbybcgovcode(bcgovcode):
     _conn = getdbconnection()
@@ -93,6 +112,190 @@ def _generate_file_attachments(producermessage, reader, auth):
                     # file_attachments.append(attachment)
     return file_attachments
 
+# New function to split comments into pages
+def split_comments_to_pages(comments,font,font_size, canvas, lines_per_page=50):
+    pages = []
+    current_page = []
+    for comment in comments:
+        print("\n Each comment:",comment)
+        if 'text' in comment:
+            comment_text = f"{comment['text']}"
+            #comment_text = f"Page {comment['page']}: {comment['text']}\n"
+            # Wrap the text to fit within the page width
+            wrapped_lines = wrap_text(comment_text, width=500, font=font, font_size=font_size, canvas=canvas)
+            for line in wrapped_lines:
+                current_page.append(line)
+                if len(current_page) >= lines_per_page:
+                    pages.append(current_page)
+                    current_page = []   
+    if current_page:  # Add any remaining comments to the last page
+        pages.append(current_page)    
+    print("pages-split_comments_to_pages:",pages)
+    return pages
+
+
+def wrap_text(text, width, font, font_size, canvas):
+    """
+    This function wraps text into lines based on the available width.
+    """
+    wrapped_lines = []
+    line = ""    
+    text_width = canvas.stringWidth(text, font, font_size)
+    if text_width <= width:
+        line = text
+    else:
+        words = text.split(" ")
+        for word in words:
+            # Check the width of the line with the current word
+            line_width = canvas.stringWidth(line + word + " ", font, font_size)
+            if line_width <= width:
+                line += word + " "
+            else:
+                #If the word doesn't fit, append the current line and start a new one
+                if line:
+                    print("line::",line)
+                    wrapped_lines.append(line)  # Append current line
+                line = ""  # Reset line
+                # Handle long words that need to be broken up
+                while canvas.stringWidth(word, font, font_size) > width:
+                    # Find the largest part of the word that fits within the width
+                    for i in range(1, len(word) + 1):
+                        part = word[:i]
+                        if canvas.stringWidth(part, font, font_size) > width:
+                            # Append the part that fits and continue with the remaining part
+                            wrapped_lines.append(word[:i - 1])  # Add the part that fits
+                            word = word[i - 1:]  # Remaining part of the word
+                            break
+                # Add the remaining part of the word to the line
+                line = word + " "
+    # Append the last line
+    if line:
+        wrapped_lines.append(line)
+    return wrapped_lines
+
+def _clearmetadata(response, pagecount, reader, s3_access_key_id,s3_secret_access_key,filepath,auth):
+    # clear metadata
+    reader2 = PyPDF2.PdfReader(BytesIO(response.content))
+    # Check if metadata exists.
+    if reader2.metadata is not None:
+        # Create a new PDF file without metadata.
+        writer = PyPDF2.PdfWriter()
+        # Copy pages from the original PDF to the new PDF.
+        for page_num in range(len(reader.pages)):
+            page = reader2.pages[page_num]                
+            try:
+                #Function to get all comments type annotations & copy it to a new page
+                pagecount, writer= createpagesforcomments(page, page_num, writer, reader2, pagecount)
+            except Exception as e:
+                print(f"Error in creating new page with comment annotations: {e}")
+        buffer = BytesIO()
+        writer.write(buffer)
+        try:
+            # Now, flatten the PDF content using the __flattenfitz function
+            flattened_buffer = __flattenfitz(buffer.getvalue())
+        except Exception as e:
+            print(f"Error in flatenning pdf: {e}")
+
+        client = boto3.client('s3',config=Config(signature_version='s3v4'),
+            endpoint_url='https://{0}/'.format(dedupe_s3_host),
+            aws_access_key_id= s3_access_key_id,
+            aws_secret_access_key= s3_secret_access_key,
+            region_name= dedupe_s3_region
+        )
+        copyresponse = client.copy_object(
+            CopySource="/" + "/".join(filepath.split("/")[3:]), # /Bucket-name/path/filename
+            Bucket=filepath.split("/")[3], # Destination bucket
+            Key= "/".join(filepath.split("/")[4:])[:-4] + 'ORIGINAL' + '.pdf' # Destination path/filename
+        )
+        uploadresponse = requests.put(
+            filepath,
+            data= flattened_buffer.getvalue(), #buffer.getvalue(),
+            auth=auth
+        )
+        uploadresponse.raise_for_status()
+    return pagecount
+
+def __flattenfitz(docbytesarr):
+    doc = fitz.open(stream=BytesIO(docbytesarr))
+    out = fitz.open()  # output PDF
+    for page in doc:
+        w, h = page.rect.br  # page width / height taken from bottom right point coords
+        outpage = out.new_page(width=w, height=h)  # out page has same dimensions
+        pix = page.get_pixmap(dpi=150)  # set desired resolution
+        outpage.insert_image(page.rect, pixmap=pix)
+    # Saving the flattened PDF to a buffer
+    buffer = BytesIO()
+    out.save(buffer, garbage=3, deflate=True)
+    buffer.seek(0)  # Reset the buffer to the beginning
+    return buffer
+
+def __rendercommentsonnewpage(comments,pagecount,writer,parameters):
+    try:
+        comments_pdf = BytesIO()
+        c = canvas.Canvas(comments_pdf, pagesize=letter)
+        font = parameters.get("font")
+        font_size = parameters.get("fontsize")       
+        width = parameters.get("width")
+        height = parameters.get("height")
+        currentpagesize = (width, height)
+        comment_pages = split_comments_to_pages(comments, font, font_size, c, lines_per_page=50)
+        for comment_page in comment_pages:
+            text = c.beginText(40, 750)
+            text.setFont(font, font_size)
+            for line in comment_page:
+                text.textLine(line)
+            c.setPageSize(currentpagesize)
+            c.drawText(text)
+            c.showPage()
+            pagecount += 1
+        c.save()
+        comments_pdf.seek(0)
+        comments_pdf_reader = PyPDF2.PdfReader(comments_pdf)
+        writer.add_page(comments_pdf_reader.pages[0])  # Add comments as a new page
+        return pagecount,writer
+    except Exception as e:
+        print(f"Error in rendering comments on new page in pdf: {e}")
+
+def createpagesforcomments(page, page_num, writer, reader2, pagecount):
+    # Check if the page contains annotations
+    if "/Annots" in page:
+        comments = []
+        annotations = page["/Annots"]
+        # Create a new PDF overlay with reportlab to draw annotation content
+        annotation_overlay = BytesIO()
+        c = canvas.Canvas(annotation_overlay, pagesize=letter)
+        pagenum=page_num + 1
+        for annot in annotations:
+            annotation_obj = annot.get_object()
+            subtype = annotation_obj["/Subtype"]
+            #print("\nAnnotation Object:", annotation_obj)
+            #Flatten comments - collect all the annots
+            if subtype == "/Text" and "/Contents" in annotation_obj:
+                comment = annotation_obj["/Contents"]
+                comments.append({
+                    'page': pagenum,#page_num + 1,
+                    'text': comment
+                })
+        # Finalize annotation overlay for the page
+        c.save()
+        annotation_overlay.seek(0)
+        # Merge the overlay (annotations rendered as static) onto the original PDF page
+        overlay_pdf = PyPDF2.PdfReader(annotation_overlay)
+        if len(overlay_pdf.pages) > 0:
+            overlay_page = overlay_pdf.pages[0]
+            page.merge_page(overlay_page)
+        writer.add_page(page)
+        if comments:
+            try:
+                parameters = get_page_properties(reader2, page_num)
+                # If there are comments, create an additional page for them
+                pagecount,writer=__rendercommentsonnewpage(comments,pagecount,writer,parameters)
+            except Exception as e:
+                print(f"Error in rendering comments on new page in pdf: {e}")
+    else:
+        writer.add_page(page)
+    return pagecount, writer
+
 def gets3documenthashcode(producermessage):
     s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)    
     s3_access_key_id = s3credentials.s3accesskey
@@ -173,9 +376,14 @@ def gets3documenthashcode(producermessage):
                         "Content-Type": "application/json",
                     }
                 )
-                saveresponse.raise_for_status()
+                saveresponse.raise_for_status()   
         fitz_reader.close()
-
+        # clear metadata
+        try:
+            pagecount= _clearmetadata(response, pagecount, reader, s3_access_key_id,s3_secret_access_key,filepath,auth)
+        except Exception as e:
+            print(f"Exception while clearing metadata/flattening: {e}")
+
     elif extension.lower() in file_conversion_types:
         # "Extension different {0}, so need to download pdf here for pagecount!!".format(extension))
         pdfresponseofconverted = requests.get(
@@ -193,3 +401,22 @@ def gets3documenthashcode(producermessage):
         sig.update(line)
 
     return (sig.hexdigest(), pagecount)
+
+
+def get_page_properties(original_pdf, pagenum, font="BC-Sans") -> dict:
+    """Getting parameters of previous page for new page"""
+    width = original_pdf.pages[pagenum].mediabox.width  
+    height = original_pdf.pages[pagenum].mediabox.height
+    if height < 450:
+        fontsize=10
+    else:
+        fontsize=12
+    return {
+        "width": width,
+        "height": height,
+        "fontsize": fontsize,
+        "font": font,
+        "numberofpages": len(original_pdf.pages),
+    }
+
+
diff --git a/computingservices/DedupeServices/utils/foidedupeconfig.py b/computingservices/DedupeServices/utils/foidedupeconfig.py
@@ -18,7 +18,6 @@
 dedupe_db_user = os.getenv("DEDUPE_DB_USER")
 dedupe_db_password = os.getenv("DEDUPE_DB_PASSWORD")
 
-dedupe_s3_host = os.getenv("DEDUPE_S3_HOST")
 dedupe_s3_host = os.getenv("DEDUPE_S3_HOST")
 dedupe_s3_region = os.getenv("DEDUPE_S3_REGION")
 dedupe_s3_service = os.getenv("DEDUPE_S3_SERVICE")

diff --git a/computingservices/DedupeServices/utils/fonts/BCSans-Regular_2f.ttf b/computingservices/DedupeServices/utils/fonts/BCSans-Regular_2f.ttf
diff --git a/computingservices/DocumentServices/rstreamio/message/schemas/redactionsummary.py b/computingservices/DocumentServices/rstreamio/message/schemas/redactionsummary.py
@@ -30,7 +30,7 @@ def __init__(self, sorteddocuments, pkgdocuments) -> None:
 
 class RedactionSummaryMessage(object):
     def __init__(self, jobid, requestid, ministryrequestid, category, requestnumber, 
-                 bcgovcode, createdby, filestozip, finaloutput, attributes, summarydocuments ,redactionlayerid) -> None:
+                 bcgovcode, createdby, filestozip, finaloutput, attributes, summarydocuments ,redactionlayerid,requesttype) -> None:
         self.jobid = jobid
         self.requestid = requestid
         self.ministryrequestid = ministryrequestid
@@ -43,6 +43,7 @@ def __init__(self, jobid, requestid, ministryrequestid, category, requestnumber,
         self.attributes = attributes
         self.summarydocuments = summarydocuments
         self.redactionlayerid = redactionlayerid
+        self.requesttype = requesttype
 
 
 def get_in_redactionsummary_msg(producer_json):