Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revert flattening pdf changes #679

Merged
merged 7 commits into from
Dec 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions api/reviewer_api/models/DocumentMaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,15 @@ def getdocumentproperty(cls, ministryrequestid, deleted):
documentmasters = []
try:
sql = """select dm.documentmasterid, dm.processingparentid, d.documentid, d.version,
dhc.rank1hash, d.filename, d.pagecount, d.attributes, dm.parentid from "DocumentMaster" dm,
dhc.rank1hash, d.filename, d.pagecount, dm.parentid from "DocumentMaster" dm,
"Documents" d, "DocumentHashCodes" dhc
where dm.ministryrequestid = :ministryrequestid and dm.ministryrequestid = d.foiministryrequestid
and dm.documentmasterid = d.documentmasterid
and d.documentid = dhc.documentid order by dm.documentmasterid;"""
rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
for row in rs:
if (row["processingparentid"] is not None and row["processingparentid"] not in deleted) or (row["processingparentid"] is None and row["documentmasterid"] not in deleted):
documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "documentattribute": row["attributes"], "parentid": row["parentid"], "version": row["version"]})
documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "parentid": row["parentid"], "version": row["version"]})
except Exception as ex:
logging.error(ex)
db.session.close()
Expand Down
15 changes: 6 additions & 9 deletions api/reviewer_api/resources/foiflowmasterdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,11 @@ def post():
)

documentobjs = []
documentids = [documentinfo["file"]["documentid"] for documentinfo in data["documentobjs"]]
documents = documentservice().getdocumentbyids(documentids)
for documentinfo in data["documentobjs"]:
filepath = "/".join(documentinfo["file"]["filepath"].split("/")[4:])
if documentinfo["file"]["processedfilepath"]:
filepath = "/".join(documentinfo["file"]["processedfilepath"].split("/")[4:])
filepath = "/".join(documents[documentinfo["file"]["documentid"]].split("/")[4:])
filename, file_extension = os.path.splitext(filepath)
if file_extension in FILE_CONVERSION_FILE_TYPES:
filepath = filename + ".pdf"

documentinfo["s3url"] = s3client.generate_presigned_url(
ClientMethod="get_object",
Params={
Expand Down Expand Up @@ -315,9 +312,9 @@ def post(ministryrequestid):
# for save/put - stitch by division
div["s3path_save"] = s3path_save
for doc in div["documentlist"]:
filepathlist = doc["filepath"].split("/")[4:]
if doc["processedfilepath"]:
filepathlist = doc["processedfilepath"].split("/")[4:]
realfilepath = documentservice().getfilepathbydocumentid(doc["documentid"])
# filepathlist = doc["filepath"].split("/")[4:]
filepathlist = realfilepath.split("/")[4:]

# for load/get
filepath_get = "/".join(filepathlist)
Expand Down
19 changes: 3 additions & 16 deletions api/reviewer_api/services/documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def __updateproperties(
_att_in_properties = []
(
record["pagecount"],
record["processedpagecount"],
record["filename"],
record["documentid"],
record["version"],
Expand Down Expand Up @@ -121,7 +120,6 @@ def __updateproperties(

(
attachment["pagecount"],
attachment["processedpagecount"],
attachment["filename"],
attachment["documentid"],
attachment["version"],
Expand All @@ -140,15 +138,9 @@ def __filterrecords(self, records):
if record["recordid"] is None:
attchments.append(record)
return parentrecords, parentswithattachments, attchments

def __getprocessedpagecount(self, property, pagecount):
    """Return the processed page count stored in a property's document attributes.

    Falls back to *pagecount* when the attributes are missing/empty or do not
    carry a 'processedpagecount' entry.
    """
    attributes = property["documentattribute"]
    if not attributes:
        return pagecount
    return attributes.get("processedpagecount", pagecount)


def __getpagecountandfilename(self, record, properties):
pagecount = 0
processedpagecount = 0
filename = record["filename"] if "filename" in record else None
documentid = None
version = 0
Expand All @@ -158,11 +150,10 @@ def __getpagecountandfilename(self, record, properties):
and record["documentmasterid"] == property["documentmasterid"]
):
pagecount = property["pagecount"]
processedpagecount = self.__getprocessedpagecount(property, pagecount)
filename = property["filename"]
documentid = property["documentid"]
version = property["version"]
return pagecount, processedpagecount, filename, documentid, version
return pagecount, filename, documentid, version

def __getduplicatemsgattachment(self, records, attachmentproperties, attachment):
_occurances = []
Expand Down Expand Up @@ -411,9 +402,7 @@ def updatedocumentattributes(self, payload, userid):

return DocumentAttributes.update(newRows, oldRows)

def __getprocessedfilepath(self, attributes):
    """Return the 'processedfilepath' entry of *attributes*, or None when absent."""
    return attributes.get("processedfilepath")


def getdocuments(self, requestid,bcgovcode):
divisions_data = requests.request(
method='GET',
Expand Down Expand Up @@ -464,8 +453,6 @@ def getdocuments(self, requestid,bcgovcode):

for documentid in documents:
document = documents[documentid]
documentattributes = document["attributes"]
document["processedfilepath"] = self.__getprocessedfilepath(documentattributes)
documentdivisions = set(
map(lambda d: d["divisionid"], document["attributes"]["divisions"])
)
Expand Down
14 changes: 5 additions & 9 deletions computingservices/DedupeServices/services/dedupedbservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,27 @@
from datetime import datetime
import json

def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1, processedfilepath="", processedpagecount = 1):
def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1):
conn = getdbconnection()
try:
cursor = conn.cursor()

_incompatible = True if str(dedupeproducermessage.incompatible).lower() == 'true' else False

attributes = {"processedpagecount": processedpagecount} if processedpagecount > 1 else None
cursor.execute('INSERT INTO public."Documents" (version, \
filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount,attributes) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer,%s) RETURNING documentid;',
filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer) RETURNING documentid;',
(1, dedupeproducermessage.filename, dedupeproducermessage.outputdocumentmasterid or dedupeproducermessage.documentmasterid,
dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount, json.dumps(attributes)))
dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount))
conn.commit()
id_of_new_row = cursor.fetchone()

documentattribute = dedupeproducermessage.attributes
if processedfilepath:
documentattribute["processedfilepath"] = processedfilepath
if (dedupeproducermessage.attributes.get('isattachment', False) and dedupeproducermessage.trigger == 'recordreplace'):
documentmasterid = dedupeproducermessage.originaldocumentmasterid or dedupeproducermessage.documentmasterid
else:
documentmasterid = dedupeproducermessage.documentmasterid

cursor.execute('''UPDATE public."DocumentAttributes" SET attributes = %s WHERE documentmasterid = %s''',
(json.dumps(documentattribute), documentmasterid))
(json.dumps(dedupeproducermessage.attributes), documentmasterid))
conn.commit()

cursor.execute('INSERT INTO public."DocumentHashCodes" (documentid, \
Expand Down
4 changes: 2 additions & 2 deletions computingservices/DedupeServices/services/dedupeservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
def processmessage(message):
recordjobstart(message)
try:
hashcode, _pagecount, _processedpagecount, _processedfilepath = gets3documenthashcode(message)
savedocumentdetails(message, hashcode, _pagecount, _processedfilepath, _processedpagecount)
hashcode, _pagecount = gets3documenthashcode(message)
savedocumentdetails(message, hashcode, _pagecount)
recordjobend(message, False)
updateredactionstatus(message)
except(Exception) as error:
Expand Down
143 changes: 5 additions & 138 deletions computingservices/DedupeServices/services/s3documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import requests
from aws_requests_auth.aws_auth import AWSRequestsAuth
from pypdf import PdfReader, PdfWriter
import fitz
from io import BytesIO
from html import escape
import hashlib
Expand All @@ -22,8 +21,7 @@
dedupe_s3_service,
dedupe_s3_env,
request_management_api,
file_conversion_types,
convert_to_pst
file_conversion_types
)

def __getcredentialsbybcgovcode(bcgovcode):
Expand Down Expand Up @@ -51,135 +49,6 @@ def __getcredentialsbybcgovcode(bcgovcode):

return s3cred

def savedocumenttos3(pdfwithannotations, s3uripath, auth):
    """Upload the given PDF bytes to *s3uripath* with an authenticated PUT.

    Raises an HTTPError (via raise_for_status) when the upload is rejected.
    """
    upload = requests.put(s3uripath, data=pdfwithannotations, auth=auth)
    upload.raise_for_status()

def __append_if_exists(text, key, value):
    """Append a "key: value" line to *text* when *value* is truthy; return the result."""
    return f"{text}{key}: {value}\n" if value else text

def extract_annotations_from_pdf(pdf_document, output_bytestream):
    """Harvest commented annotations from *pdf_document*, rewriting them in place.

    Each annotation that carries comment text has its content replaced by a
    numbered "Legend [page:index]" marker and its author rewritten to
    "Original Document Comment"; the original text is collected so it can be
    rendered elsewhere. Annotations without comment text are deleted from
    their page. The edited document is serialized into *output_bytestream*.

    Returns a list of dicts with keys: Legend, OriginalContent, Author,
    Subject, PageNumber.
    """
    all_annotations = []
    output_pdf = fitz.open()  # scratch document used only to serialize the edited pages
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        index = 1  # per-page counter used in the legend label (resets each page)
        annotations = page.annots()
        for annot in annotations:

            content = annot.info.get('content', '')
            if content:
                legend_text = f"Legend [{page_num}:{str(index)}]"
                new_content = legend_text + ":The comment text of the annotation is added as part of the pdf."
                index += 1
                author = annot.info.get('title', '')  # PDF annotation "title" holds the author
                new_author = "Original Document Comment"
                annot.set_info(content=new_content,title=new_author)
                annot.update()
                annot_dict = {
                    'Legend': legend_text,
                    'OriginalContent': content,
                    'Author': author,
                    'Subject': annot.info.get('subject', ''),
                    'PageNumber': page_num,
                    # 'CreationDate': annot.info.get('creationDate', ''),
                    # 'ModDate': annot.info.get('modDate', ''),
                    # 'Type': annot.type[1]
                }
                all_annotations.append(annot_dict)
            else:
                # No comment text: drop the annotation entirely.
                page.delete_annot(annot)
    # Copy the (now mutated) source document into the output and save it.
    output_pdf.insert_pdf(pdf_document)
    if output_pdf:  # NOTE(review): a fitz.Document is always truthy here — guard appears redundant; confirm
        output_pdf.save(output_bytestream)
    return all_annotations


def __constructannotationtext(annot):
    """Format one harvested annotation dict as human-readable text.

    Emits one "Label: value" line per populated field, in a fixed order,
    then a trailing blank line separating this annotation from the next.
    """
    rendered = ""
    for label, key in (("Legend", "Legend"),
                       ("Subject", "Subject"),
                       ("Author", "Author"),
                       ("Original Content", "OriginalContent")):
        rendered = __append_if_exists(rendered, label, annot[key])
    return rendered + "\n"

def add_annotations_as_text_to_pdf(source_document, bytes_stream):
    """Append each page's annotation comments as extra text pages.

    Harvests annotations via extract_annotations_from_pdf (which also rewrites
    them in place), then inserts new pages after each source page containing
    the formatted comment text. Saves the combined result into *bytes_stream*
    and returns the resulting total page count.
    """
    output_bytestream = BytesIO()
    annotations = extract_annotations_from_pdf(source_document, output_bytestream)
    updated_stream = output_bytestream.getvalue()
    updated_document = fitz.open(stream=updated_stream)
    processedpagecount = 1
    destination_document = fitz.open()
    text_line_spacing = 15  # vertical points advanced per rendered text line
    page_height = 792  # assumed page height in points (792 = US Letter) — TODO confirm
    new_page_index = 0  # tracks the insertion point as inserted pages shift indices
    for page_index in range(updated_document.page_count):
        if new_page_index == 0:
            new_page_index = page_index
        text_start_position = 50  # top margin for the first line on a fresh text page
        annotations_on_page = [annot for annot in annotations if annot.get('PageNumber') == page_index]
        for annot in annotations_on_page:
            annot_text = __constructannotationtext(annot)
            lines_needed = len(annot_text.split('\n'))
            # position == 50 means nothing written yet for this source page:
            # open a fresh text page right after it.
            if text_start_position == 50:
                new_page_index += 1
                updated_document.insert_page(new_page_index)
                new_page = updated_document.load_page(new_page_index)
            # Overflow: start another text page when this block will not fit.
            # NOTE(review): for an oversized first block both conditions can
            # fire, inserting two pages back-to-back — confirm intended.
            if text_start_position + lines_needed * text_line_spacing > page_height - 50:
                new_page_index += 1
                updated_document.insert_page(new_page_index)
                new_page = updated_document.load_page(new_page_index)
                text_start_position = 50
            try:
                new_page.insert_text((50, text_start_position), annot_text, fontsize=10)
            except Exception as e:
                print(f"Error occurred while inserting text: {e}")
            text_start_position += lines_needed * text_line_spacing
        new_page_index += 1

    destination_document.insert_pdf(updated_document)

    if destination_document:
        processedpagecount = destination_document.page_count
        destination_document.save(bytes_stream)
        destination_document.close()
        del destination_document
    return processedpagecount

def handleannotationsinpdf(_bytes, filepath, extension, auth):
    """Flatten PDF comment annotations into visible text pages.

    If the PDF in *_bytes* has annotations, their text is appended to the
    document as extra pages and the modified copy is uploaded to S3 beside
    the original (same GUID name with an "_updated" suffix).

    Returns a (processedpagecount, s3uripath) tuple. When the document has
    no annotations — or processing fails — the defaults (1, "") are
    returned so callers can always unpack the pair.
    """
    bytes_stream = None
    try:
        bytes_stream = BytesIO()
        s3uripath = ""
        source_document = fitz.open(stream=_bytes)
        processedpagecount = 1
        if source_document.has_annots():
            processedpagecount = add_annotations_as_text_to_pdf(source_document, bytes_stream)
        _updatedbytes = bytes_stream.getvalue()
        if source_document:
            source_document.close()
        if len(_updatedbytes) > 0:
            # new filename with existing guid: <filename>_updated<extension>
            s3uripath = path.splitext(filepath)[0] + "_updated" + extension
            savedocumenttos3(_updatedbytes, s3uripath, auth)
        return processedpagecount, s3uripath
    except Exception as e:
        print(f"Error occurred while processing pdf with annotations: {e}")
        # BUG FIX: previously fell through and implicitly returned None, which
        # broke the caller's two-value unpacking; return safe defaults instead.
        return 1, ""
    finally:
        # Always release the in-memory buffer, even on failure.
        if bytes_stream:
            bytes_stream.close()

def gets3documenthashcode(producermessage):
s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
s3_access_key_id = s3credentials.s3accesskey
Expand All @@ -193,8 +62,6 @@ def gets3documenthashcode(producermessage):
)

pagecount = 1
processedpagecount = 1
processedfilepath = ""
_filename, extension = path.splitext(producermessage.filename)
filepath = producermessage.s3filepath
producermessage.attributes = json.loads(producermessage.attributes)
Expand All @@ -206,9 +73,9 @@ def gets3documenthashcode(producermessage):
response = requests.get("{0}".format(filepath), auth=auth, stream=True)
reader = None
if extension.lower() in [".pdf"]:
_bytes = BytesIO(response.content)
processedpagecount, processedfilepath = handleannotationsinpdf(_bytes, filepath, extension, auth)
reader = PdfReader(_bytes)
reader = PdfReader(BytesIO(response.content))

# "No of pages in {0} is {1} ".format(_filename, len(reader.pages)))
pagecount = len(reader.pages)
attachments = []
if reader.attachments:
Expand Down Expand Up @@ -265,4 +132,4 @@ def gets3documenthashcode(producermessage):
for line in response.iter_lines():
sig.update(line)

return (sig.hexdigest(), pagecount, processedpagecount, processedfilepath)
return (sig.hexdigest(), pagecount)
1 change: 0 additions & 1 deletion computingservices/DedupeServices/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@
from .foidedupeconfig import *
from .jsonmessageparser import getdedupeproducermessage,gets3credentialsobject
from .dbconnection import getdbconnection
from .commons.datetimehandler import convert_to_pst
24 changes: 0 additions & 24 deletions computingservices/DedupeServices/utils/commons/datetimehandler.py

This file was deleted.

Loading
Loading