diff --git a/api/reviewer_api/models/DocumentMaster.py b/api/reviewer_api/models/DocumentMaster.py
index 5a850a79f..920e00d38 100644
--- a/api/reviewer_api/models/DocumentMaster.py
+++ b/api/reviewer_api/models/DocumentMaster.py
@@ -208,7 +208,7 @@ def getdocumentproperty(cls, ministryrequestid, deleted):
         documentmasters = []
         try:
             sql = """select dm.documentmasterid, dm.processingparentid, d.documentid, d.version,
-                    dhc.rank1hash, d.filename, d.pagecount, d.attributes, dm.parentid from "DocumentMaster" dm,
+                    dhc.rank1hash, d.filename, d.pagecount, dm.parentid from "DocumentMaster" dm,
                     "Documents" d, "DocumentHashCodes" dhc
                     where dm.ministryrequestid = :ministryrequestid and dm.ministryrequestid = d.foiministryrequestid
                     and dm.documentmasterid = d.documentmasterid
@@ -216,7 +216,7 @@ def getdocumentproperty(cls, ministryrequestid, deleted):
             rs = db.session.execute(text(sql), {'ministryrequestid': ministryrequestid})
             for row in rs:
                 if (row["processingparentid"] is not None and row["processingparentid"] not in deleted) or (row["processingparentid"] is None and row["documentmasterid"] not in deleted):
-                    documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "documentattribute": row["attributes"], "parentid": row["parentid"], "version": row["version"]})
+                    documentmasters.append({"documentmasterid": row["documentmasterid"], "processingparentid": row["processingparentid"], "documentid": row["documentid"], "rank1hash": row["rank1hash"], "filename": row["filename"], "pagecount": row["pagecount"], "parentid": row["parentid"], "version": row["version"]})
         except Exception as ex:
             logging.error(ex)
         db.session.close()
diff --git a/api/reviewer_api/resources/foiflowmasterdata.py b/api/reviewer_api/resources/foiflowmasterdata.py
index 27dc9528e..b95e0b2c0 100644
--- a/api/reviewer_api/resources/foiflowmasterdata.py
+++ b/api/reviewer_api/resources/foiflowmasterdata.py
@@ -129,14 +129,11 @@ def post():
             )
             documentobjs = []
+            documentids = [documentinfo["file"]["documentid"] for documentinfo in data["documentobjs"]]
+            documents = documentservice().getdocumentbyids(documentids)
             for documentinfo in data["documentobjs"]:
-                filepath = "/".join(documentinfo["file"]["filepath"].split("/")[4:])
-                if documentinfo["file"]["processedfilepath"]:
-                    filepath = "/".join(documentinfo["file"]["processedfilepath"].split("/")[4:])
+                filepath = "/".join(documents[documentinfo["file"]["documentid"]].split("/")[4:])
                 filename, file_extension = os.path.splitext(filepath)
-                if file_extension in FILE_CONVERSION_FILE_TYPES:
-                    filepath = filename + ".pdf"
-
                 documentinfo["s3url"] = s3client.generate_presigned_url(
                     ClientMethod="get_object",
                     Params={
@@ -315,9 +312,9 @@ def post(ministryrequestid):
                 # for save/put - stitch by division
                 div["s3path_save"] = s3path_save
                 for doc in div["documentlist"]:
-                    filepathlist = doc["filepath"].split("/")[4:]
-                    if doc["processedfilepath"]:
-                        filepathlist = doc["processedfilepath"].split("/")[4:]
+                    realfilepath = documentservice().getfilepathbydocumentid(doc["documentid"])
+                    # filepathlist = doc["filepath"].split("/")[4:]
+                    filepathlist = realfilepath.split("/")[4:]

                     # for load/get
                     filepath_get = "/".join(filepathlist)
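Note on the two lookups introduced above: documentservice().getdocumentbyids() and documentservice().getfilepathbydocumentid() are not defined anywhere in this patch. From the call sites they are assumed to resolve document ids to the stored file paths. A minimal sketch of that assumed contract, with a hypothetical class shape and model helper:

    # Sketch only; the real methods belong in api/reviewer_api/services/documentservice.py
    # and may differ in naming and return shape.
    class documentservice:
        def getdocumentbyids(self, documentids):
            # Assumed: one batched query, returning {documentid: filepath}.
            rows = Document.getdocumentsbyids(documentids)  # hypothetical model helper
            return {row["documentid"]: row["filepath"] for row in rows}

        def getfilepathbydocumentid(self, documentid):
            # Assumed: the stored S3 file path for a single document.
            return self.getdocumentbyids([documentid])[documentid]

The second hunk calls getfilepathbydocumentid() once per document inside a loop; if the real method issues one query per call, batching through getdocumentbyids() would avoid an N+1 query pattern.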
diff --git a/api/reviewer_api/services/documentservice.py b/api/reviewer_api/services/documentservice.py
index b460e213d..a6c4fc6ab 100644
--- a/api/reviewer_api/services/documentservice.py
+++ b/api/reviewer_api/services/documentservice.py
@@ -74,7 +74,6 @@ def __updateproperties(
             _att_in_properties = []
             (
                 record["pagecount"],
-                record["processedpagecount"],
                 record["filename"],
                 record["documentid"],
                 record["version"],
@@ -121,7 +120,6 @@ def __updateproperties(
                     (
                         attachment["pagecount"],
-                        attachment["processedpagecount"],
                         attachment["filename"],
                         attachment["documentid"],
                         attachment["version"],
@@ -140,15 +138,9 @@ def __filterrecords(self, records):
             if record["recordid"] is None:
                 attchments.append(record)
         return parentrecords, parentswithattachments, attchments
-
-    def __getprocessedpagecount(self, property, pagecount):
-        if property["documentattribute"]:
-            return property["documentattribute"].get("processedpagecount", pagecount)
-        return pagecount
-
+
     def __getpagecountandfilename(self, record, properties):
         pagecount = 0
-        processedpagecount = 0
         filename = record["filename"] if "filename" in record else None
         documentid = None
         version = 0
@@ -158,11 +150,10 @@ def __getpagecountandfilename(self, record, properties):
                 and record["documentmasterid"] == property["documentmasterid"]
             ):
                 pagecount = property["pagecount"]
-                processedpagecount = self.__getprocessedpagecount(property, pagecount)
                 filename = property["filename"]
                 documentid = property["documentid"]
                 version = property["version"]
-        return pagecount, processedpagecount, filename, documentid, version
+        return pagecount, filename, documentid, version

     def __getduplicatemsgattachment(self, records, attachmentproperties, attachment):
         _occurances = []
@@ -411,9 +402,7 @@ def updatedocumentattributes(self, payload, userid):

         return DocumentAttributes.update(newRows, oldRows)

-    def __getprocessedfilepath(self, attributes):
-        return attributes.get("processedfilepath", None)
-
+
     def getdocuments(self, requestid,bcgovcode):
         divisions_data = requests.request(
             method='GET',
@@ -464,8 +453,6 @@ def getdocuments(self, requestid,bcgovcode):

         for documentid in documents:
             document = documents[documentid]
-            documentattributes = document["attributes"]
-            document["processedfilepath"] = self.__getprocessedfilepath(documentattributes)
             documentdivisions = set(
                 map(lambda d: d["divisionid"], document["attributes"]["divisions"])
             )
diff --git a/computingservices/DedupeServices/services/dedupedbservice.py b/computingservices/DedupeServices/services/dedupedbservice.py
index 505d54938..92d68febf 100644
--- a/computingservices/DedupeServices/services/dedupedbservice.py
+++ b/computingservices/DedupeServices/services/dedupedbservice.py
@@ -3,31 +3,27 @@
 from datetime import datetime
 import json

-def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1, processedfilepath="", processedpagecount = 1):
+def savedocumentdetails(dedupeproducermessage, hashcode, pagecount = 1):
     conn = getdbconnection()
     try:
         cursor = conn.cursor()
-
+
         _incompatible = True if str(dedupeproducermessage.incompatible).lower() == 'true' else False
-        attributes = {"processedpagecount": processedpagecount} if processedpagecount > 1 else None
         cursor.execute('INSERT INTO public."Documents" (version, \
-            filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount,attributes) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer,%s) RETURNING documentid;',
+            filename, documentmasterid,foiministryrequestid,createdby,created_at,statusid,incompatible,pagecount) VALUES(%s::integer, %s, %s,%s::integer,%s,%s,%s::integer,%s::bool,%s::integer) RETURNING documentid;',
             (1, dedupeproducermessage.filename,
             dedupeproducermessage.outputdocumentmasterid or dedupeproducermessage.documentmasterid,
-            dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount, json.dumps(attributes)))
+            dedupeproducermessage.ministryrequestid,'{"user":"dedupeservice"}',datetime.now(),1,_incompatible,pagecount))
         conn.commit()
         id_of_new_row = cursor.fetchone()
-        documentattribute = dedupeproducermessage.attributes
-        if processedfilepath:
-            documentattribute["processedfilepath"] = processedfilepath
         if (dedupeproducermessage.attributes.get('isattachment', False) and dedupeproducermessage.trigger == 'recordreplace'):
             documentmasterid = dedupeproducermessage.originaldocumentmasterid or dedupeproducermessage.documentmasterid
         else:
             documentmasterid = dedupeproducermessage.documentmasterid
         cursor.execute('''UPDATE public."DocumentAttributes" SET attributes = %s WHERE documentmasterid = %s''',
-                       (json.dumps(documentattribute), documentmasterid))
+                       (json.dumps(dedupeproducermessage.attributes), documentmasterid))
         conn.commit()

         cursor.execute('INSERT INTO public."DocumentHashCodes" (documentid, \
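With the attributes column dropped from the INSERT above, newly deduplicated documents no longer record a processedpagecount, and DocumentAttributes is written from the producer message verbatim. Rows created while the annotation feature was live may still carry the old key. If a cleanup is ever wanted, it could look roughly like this (hypothetical one-off, not part of this patch; assumes the attributes column is jsonb):

    # Hypothetical cleanup of leftover keys, reusing the conn/cursor pattern from
    # savedocumentdetails above; verify the column type and take a backup first.
    cursor.execute('''UPDATE public."Documents"
                      SET attributes = attributes - 'processedpagecount'
                      WHERE attributes ? 'processedpagecount' ''')
    conn.commit()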
diff --git a/computingservices/DedupeServices/services/dedupeservice.py b/computingservices/DedupeServices/services/dedupeservice.py
index 99a2928dc..d434c737b 100644
--- a/computingservices/DedupeServices/services/dedupeservice.py
+++ b/computingservices/DedupeServices/services/dedupeservice.py
@@ -7,8 +7,8 @@ def processmessage(message):
     recordjobstart(message)
     try:
-        hashcode, _pagecount, _processedpagecount, _processedfilepath = gets3documenthashcode(message)
-        savedocumentdetails(message, hashcode, _pagecount, _processedfilepath, _processedpagecount)
+        hashcode, _pagecount = gets3documenthashcode(message)
+        savedocumentdetails(message, hashcode, _pagecount)
         recordjobend(message, False)
         updateredactionstatus(message)
     except(Exception) as error:
diff --git a/computingservices/DedupeServices/services/s3documentservice.py b/computingservices/DedupeServices/services/s3documentservice.py
index dd1276d98..96620fdf2 100644
--- a/computingservices/DedupeServices/services/s3documentservice.py
+++ b/computingservices/DedupeServices/services/s3documentservice.py
@@ -8,7 +8,6 @@
 import requests
 from aws_requests_auth.aws_auth import AWSRequestsAuth
 from pypdf import PdfReader, PdfWriter
-import fitz
 from io import BytesIO
 from html import escape
 import hashlib
@@ -22,8 +21,7 @@
     dedupe_s3_service,
     dedupe_s3_env,
     request_management_api,
-    file_conversion_types,
-    convert_to_pst
+    file_conversion_types
 )

 def __getcredentialsbybcgovcode(bcgovcode):
@@ -51,135 +49,6 @@ def __getcredentialsbybcgovcode(bcgovcode):

     return s3cred

-def savedocumenttos3(pdfwithannotations, s3uripath, auth):
-    uploadresponse = requests.put(s3uripath, data=pdfwithannotations, auth=auth)
-    uploadresponse.raise_for_status()
-
-def __append_if_exists(text, key, value):
-    if value:
-        text += f"{key}: {value}\n"
-    return text
-
-def extract_annotations_from_pdf(pdf_document, output_bytestream):
-    all_annotations = []
-    output_pdf = fitz.open()
-    for page_num in range(pdf_document.page_count):
-        page = pdf_document.load_page(page_num)
-        index = 1
-        annotations = page.annots()
-        for annot in annotations:
-
-            content = annot.info.get('content', '')
-            if content:
-                legend_text = f"Legend [{page_num}:{str(index)}]"
-                new_content = legend_text + ":The comment text of the annotation is added as part of the pdf."
-                index += 1
-                author = annot.info.get('title', '')
-                new_author = "Original Document Comment"
-                annot.set_info(content=new_content,title=new_author)
-                annot.update()
-                annot_dict = {
-                    'Legend': legend_text,
-                    'OriginalContent': content,
-                    'Author': author,
-                    'Subject': annot.info.get('subject', ''),
-                    'PageNumber': page_num,
-                    # 'CreationDate': annot.info.get('creationDate', ''),
-                    # 'ModDate': annot.info.get('modDate', ''),
-                    # 'Type': annot.type[1]
-                }
-                all_annotations.append(annot_dict)
-            else:
-                page.delete_annot(annot)
-    output_pdf.insert_pdf(pdf_document)
-    if output_pdf:
-        output_pdf.save(output_bytestream)
-    return all_annotations
-
-
-def __constructannotationtext(annot):
-    # Construct annotation text
-    annot_text = ""
-
-    annot_text = __append_if_exists(annot_text, 'Legend', annot["Legend"])
-    annot_text = __append_if_exists(annot_text, 'Subject', annot["Subject"])
-    annot_text = __append_if_exists(annot_text, 'Author', annot["Author"])
-    annot_text = __append_if_exists(annot_text, 'Original Content', annot["OriginalContent"])
-    # creationdate = convert_to_pst(annot['CreationDate']) if annot['CreationDate'] else ''
-    # moddate = convert_to_pst(annot['ModDate']) if annot['ModDate'] else ''
-    # annot_text = __append_if_exists(annot_text, 'Annotation Type', annot["Type"])
-    # annot_text = __append_if_exists(annot_text, 'ModifiedContent', annot["ModifiedContent"])
-    # annot_text = __append_if_exists(annot_text, 'Creation Date', creationdate)
-    # annot_text = __append_if_exists(annot_text, 'Modified Date', moddate)
-    annot_text += "\n"
-    return annot_text
-
-def add_annotations_as_text_to_pdf(source_document, bytes_stream):
-    output_bytestream = BytesIO()
-    annotations = extract_annotations_from_pdf(source_document, output_bytestream)
-    updated_stream = output_bytestream.getvalue()
-    updated_document = fitz.open(stream=updated_stream)
-    processedpagecount = 1
-    destination_document = fitz.open()
-    text_line_spacing = 15
-    page_height = 792
-    new_page_index = 0
-    for page_index in range(updated_document.page_count):
-        if new_page_index == 0:
-            new_page_index = page_index
-        text_start_position = 50
-        annotations_on_page = [annot for annot in annotations if annot.get('PageNumber') == page_index]
-        for annot in annotations_on_page:
-            annot_text = __constructannotationtext(annot)
-            lines_needed = len(annot_text.split('\n'))
-            if text_start_position == 50:
-                new_page_index += 1
-                updated_document.insert_page(new_page_index)
-                new_page = updated_document.load_page(new_page_index)
-            if text_start_position + lines_needed * text_line_spacing > page_height - 50:
-                new_page_index += 1
-                updated_document.insert_page(new_page_index)
-                new_page = updated_document.load_page(new_page_index)
-                text_start_position = 50
-            try:
-                new_page.insert_text((50, text_start_position), annot_text, fontsize=10)
-            except Exception as e:
-                print(f"Error occurred while inserting text: {e}")
-            text_start_position += lines_needed * text_line_spacing
-        new_page_index += 1
-
-    destination_document.insert_pdf(updated_document)
-
-    if destination_document:
-        processedpagecount = destination_document.page_count
-        destination_document.save(bytes_stream)
-    destination_document.close()
-    del destination_document
-    return processedpagecount
-
-def handleannotationsinpdf(_bytes, filepath, extension, auth):
-    try:
-        bytes_stream = BytesIO()
-        s3uripath = ""
-        source_document = fitz.open(stream=_bytes)
-        processedpagecount = 1
-        has_annots = source_document.has_annots()
-        if has_annots:
-            processedpagecount = add_annotations_as_text_to_pdf(source_document, bytes_stream)
-        _updatedbytes = bytes_stream.getvalue()
-        if source_document:
-            source_document.close()
-        if len(_updatedbytes) > 0:
-            # new filename with existing guid filename_updated
-            s3uripath = path.splitext(filepath)[0] + "_updated" + extension
-            savedocumenttos3(_updatedbytes, s3uripath, auth)
-        if bytes_stream:
-            bytes_stream.close()
-        del bytes_stream
-        return processedpagecount, s3uripath
-    except Exception as e:
-        print(f"Error occurred while processing pdf with annotations: {e}")
-
 def gets3documenthashcode(producermessage):
     s3credentials = __getcredentialsbybcgovcode(producermessage.bcgovcode)
     s3_access_key_id = s3credentials.s3accesskey
@@ -193,8 +62,6 @@ def gets3documenthashcode(producermessage):
     )

     pagecount = 1
-    processedpagecount = 1
-    processedfilepath = ""
     _filename, extension = path.splitext(producermessage.filename)
     filepath = producermessage.s3filepath
     producermessage.attributes = json.loads(producermessage.attributes)
@@ -206,9 +73,9 @@ def gets3documenthashcode(producermessage):
     response = requests.get("{0}".format(filepath), auth=auth, stream=True)
     reader = None
     if extension.lower() in [".pdf"]:
-        _bytes = BytesIO(response.content)
-        processedpagecount, processedfilepath = handleannotationsinpdf(_bytes, filepath, extension, auth)
-        reader = PdfReader(_bytes)
+        reader = PdfReader(BytesIO(response.content))
+
+        # "No of pages in {0} is {1} ".format(_filename, len(reader.pages)))
         pagecount = len(reader.pages)
         attachments = []
         if reader.attachments:
@@ -265,4 +132,4 @@ def gets3documenthashcode(producermessage):
         for line in response.iter_lines():
             sig.update(line)

-    return (sig.hexdigest(), pagecount, processedpagecount, processedfilepath)
+    return (sig.hexdigest(), pagecount)
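With the fitz-based annotation pass removed, the PDF branch of gets3documenthashcode() reduces to a single pypdf read. A self-contained sketch of the retained behaviour (the function name and pdf_bytes argument are illustrative):

    from io import BytesIO
    from pypdf import PdfReader

    def count_pdf_pages(pdf_bytes):
        # Mirrors the kept branch above: wrap the downloaded bytes once,
        # then take the page count straight from pypdf.
        reader = PdfReader(BytesIO(pdf_bytes))
        return len(reader.pages)

    # e.g. pagecount = count_pdf_pages(response.content)

One consequence worth noting: pagecount now always reflects the uploaded file as-is, since no annotation summary pages are appended server-side anymore.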
diff --git a/computingservices/DedupeServices/utils/__init__.py b/computingservices/DedupeServices/utils/__init__.py
index 495ffdd61..ec222b146 100644
--- a/computingservices/DedupeServices/utils/__init__.py
+++ b/computingservices/DedupeServices/utils/__init__.py
@@ -3,4 +3,3 @@
 from .foidedupeconfig import *
 from .jsonmessageparser import getdedupeproducermessage,gets3credentialsobject
 from .dbconnection import getdbconnection
-from .commons.datetimehandler import convert_to_pst
diff --git a/computingservices/DedupeServices/utils/commons/datetimehandler.py b/computingservices/DedupeServices/utils/commons/datetimehandler.py
deleted file mode 100644
index 2b7b9f887..000000000
--- a/computingservices/DedupeServices/utils/commons/datetimehandler.py
+++ /dev/null
@@ -1,24 +0,0 @@
-
-from datetime import datetime, timedelta
-import pytz
-
-def convert_to_pst(datetime_str):
-    # Extract date and time parts from the string
-    date_str = datetime_str[2:10]
-    time_str = datetime_str[10:16]
-
-    # Parse the date and time strings
-    parsed_datetime = datetime.strptime(f"{date_str} {time_str}", "%Y%m%d %H%M%S")
-
-    # Extract the timezone offset and convert to timedelta
-    offset_str = datetime_str[-6:].replace("'", "")
-    offset_hours = int(offset_str[:-2])
-    offset_minutes = int(offset_str[-2:])
-    offset_delta = timedelta(hours=offset_hours, minutes=offset_minutes)
-
-    # Apply the UTC offset
-    pst_timezone = pytz.timezone('America/Los_Angeles')
-    utc_datetime = parsed_datetime - offset_delta
-    pst_datetime = utc_datetime.astimezone(pst_timezone)
-
-    return pst_datetime
diff --git a/web/src/components/FOI/Home/DocumentSelector.tsx b/web/src/components/FOI/Home/DocumentSelector.tsx
index 84af47144..faefd2c9a 100644
--- a/web/src/components/FOI/Home/DocumentSelector.tsx
+++ b/web/src/components/FOI/Home/DocumentSelector.tsx
@@ -97,7 +97,7 @@ const DocumentSelector = React.forwardRef(({

     useEffect(() => {
-        let refLength = documents.reduce((acc: any, file: any) => acc + file.processedpagecount, 0);
+        let refLength = documents.reduce((acc: any, file: any) => acc + file.pagecount, 0);
         pageRefs.current = Array(refLength).fill(0).map((_, i) => pageRefs.current[i] || createRef());
     }, [documents])

@@ -192,7 +192,7 @@ const DocumentSelector = React.forwardRef(({
         if (filterFlags.length > 0 && filterFlags.includes(0)) {
             filesForDisplay?.forEach((file: any) => {
                 let flagedpages = file.pageFlag ? file.pageFlag.length : 0;
-                unflagged += file.processedpagecount - flagedpages;
+                unflagged += file.pagecount - flagedpages;
             });
         }

@@ -387,7 +387,7 @@ const DocumentSelector = React.forwardRef(({
         }
         else
-            setFilesForDisplay(filteredFiles.filter((file: any) => ((filters.includes(0) && (typeof file.pageFlag === "undefined" || file.pageFlag?.length == 0 || file.processedpagecount != file.pageFlag?.length))
+            setFilesForDisplay(filteredFiles.filter((file: any) => ((filters.includes(0) && (typeof file.pageFlag === "undefined" || file.pageFlag?.length == 0 || file.pagecount != file.pageFlag?.length))
                 || (file.pageFlag?.find((obj: any) => ((obj.flagid != 4 && filters.includes(obj.flagid))))))
             ));
     }

@@ -602,7 +602,7 @@ const DocumentSelector = React.forwardRef(({
                 disableHoverListener={disableHover}
             >
-                {[...Array(file.processedpagecount)].map((_x, p) =>
+                {[...Array(file.pagecount)].map((_x, p) =>
                     (filterFlags.length > 0 ?
                         consulteeFilterView(file,p)
                         :
@@ -650,7 +650,7 @@ const DocumentSelector = React.forwardRef(({
             >
-                {[...Array(file.processedpagecount)].map((_x, p) =>
+                {[...Array(file.pagecount)].map((_x, p) =>
                     (filterFlags.length > 0 ?
                         consulteeFilterView(file,p,division)
                         :
diff --git a/web/src/components/FOI/Home/Home.js b/web/src/components/FOI/Home/Home.js
index 48253b6ba..f470572c4 100644
--- a/web/src/components/FOI/Home/Home.js
+++ b/web/src/components/FOI/Home/Home.js
@@ -66,7 +66,7 @@ function Home() {
     let urlPromises = [];
     _files.forEach((file, index) => {
       documentObjs.push({ file: file, s3url: "" });
-      let filePageCount = file?.processedpagecount;
+      let filePageCount = file?.pagecount;
       totalPageCountVal += filePageCount;
     });

diff --git a/web/src/components/FOI/Home/Redlining.js b/web/src/components/FOI/Home/Redlining.js
index 71651b2f4..69de17ade 100644
--- a/web/src/components/FOI/Home/Redlining.js
+++ b/web/src/components/FOI/Home/Redlining.js
@@ -165,7 +165,7 @@ const Redlining = React.forwardRef(
       if (pageFlags?.length > 0) {
         pageFlags.every((pageFlagInfo) => {
           if (docInfo.documentid == pageFlagInfo?.documentid) {
-            if (docInfo.processedpagecount > pageFlagInfo.pageflag.length) {
+            if (docInfo.pagecount > pageFlagInfo.pageflag.length) {
              // not all page has flag set
              stopLoop = true;
              return false; //stop loop
@@ -1128,7 +1128,7 @@ const Redlining = React.forwardRef(
      let mappedDocs = { stitchedPageLookup: {}, docIdLookup: {}, redlineDocIdLookup: {} };
      let mappedDoc = { docId: 0, version: 0, division: "", pageMappings: [] };
      let domParser = new DOMParser();
-      for (let i = 0; i < removedFirstElement.file.processedpagecount; i++) {
+      for (let i = 0; i < removedFirstElement.file.pagecount; i++) {
        let firstDocMappings = { pageNo: i + 1, stitchedPageNo: i + 1 };
        mappedDocs["stitchedPageLookup"][i + 1] = {
          docid: removedFirstElement.file.documentid,
@@ -2181,7 +2181,7 @@ const Redlining = React.forwardRef(
              totalPageCount += Object.keys(
                pageMappings[doc.documentid]
              ).length;
-              totalPageCountIncludeRemoved += doc.processedpagecount;
+              totalPageCountIncludeRemoved += doc.pagecount;
            }

@@ -2227,7 +2227,7 @@ const Redlining = React.forwardRef(
              totalPageCount += Object.keys(
                pageMappings[doc.documentid]
              ).length;
-              totalPageCountIncludeRemoved += doc.processedpagecount;
+              totalPageCountIncludeRemoved += doc.pagecount;
              //}
            }

@@ -2364,7 +2364,7 @@ const Redlining = React.forwardRef(
          } else {
            // create an array containing 1…N
            let pages = Array.from(
-              { length: doc.processedpagecount },
+              { length: doc.pagecount },
              (v, k) => k + 1
            );
            let pageIndexToInsert = stitchedDocObj?.getPageCount() + 1;