def __removesensitivecontent(documentbytes):
    """Rebuild a PDF so the source document's metadata is left behind.

    The pages of the incoming PDF are copied into a fresh ``PdfWriter``;
    the source document's metadata is not copied along with them.

    Args:
        documentbytes: Raw bytes of the document to be zipped. May be any
            file type; only PDFs are rewritten.

    Returns:
        The bytes of the rebuilt PDF, or ``None`` when there is nothing to
        rewrite (empty input, input that does not look like a PDF, or a PDF
        whose reader reports no metadata). Callers treat ``None`` as "use
        the original bytes".

    Raises:
        Exception: whatever PyPDF2 raises for a PDF it cannot parse or
            write; the caller is expected to fall back to the original bytes.
    """
    # Empty payloads cannot be parsed; skip them instead of raising.
    if not documentbytes:
        return None

    # Non-PDF files (office documents, images, ...) also pass through this
    # function. PDF readers conventionally look for the header within the
    # first 1024 bytes, so anything without it is returned untouched rather
    # than triggering a parser error and a traceback for every such file.
    if b"%PDF-" not in bytes(documentbytes[:1024]):
        return None

    reader = PyPDF2.PdfReader(BytesIO(documentbytes))
    if reader.metadata is None:
        # Nothing to strip; signal the caller to keep the original bytes.
        return None

    writer = PyPDF2.PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    # NOTE: link/annotation removal (writer.remove_links()) is intentionally
    # not performed here; pages are copied as they are.

    buffer = BytesIO()
    writer.write(buffer)
    return buffer.getvalue()