Skip to content

Commit

Permalink
Merge pull request #1135 from bcgov/dev-AA-FOIMOD-3439
Browse files Browse the repository at this point in the history
#FOIMOD-3439, removing sensitive info from PDF doc, Zipper services
  • Loading branch information
abin-aot authored Sep 5, 2024
2 parents fe49cec + aa3ad6a commit bda7e52
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,5 @@ bld/
MCS.FOI.S3FileConversion/MCS.FOI.S3FileConversion/QtBinariesWindows/
computingservices/ZippingServices/env/*
openshift/templates/zippingservice/zipper.env
*.locenv

Binary file modified computingservices/ZippingServices/requirements.txt
Binary file not shown.
27 changes: 26 additions & 1 deletion computingservices/ZippingServices/services/zipperservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .notificationservice import notificationservice
import json
import traceback
import PyPDF2


def processmessage(message):
Expand Down Expand Up @@ -110,8 +111,16 @@ def __zipfilesandupload(_message, s3credentials):
for fileobj in _jsonfiles:
filename = fileobj["filename"]
print("\nfilename:",filename)

_docbytes = __getdocumentbytearray(fileobj, s3credentials)
_formattedbytes = None

try:
_formattedbytes = __removesensitivecontent(_docbytes)
except Exception:
print(traceback.format_exc())
zip.writestr(
filename, __getdocumentbytearray(fileobj, s3credentials)
filename, _docbytes if _formattedbytes is None else _formattedbytes
)

tp.seek(0)
Expand All @@ -136,6 +145,22 @@ def __zipfilesandupload(_message, s3credentials):
finally:
zipped_bytes = None

def __removesensitivecontent(documentbytes):
    """Rebuild a PDF page-by-page so its document metadata is left behind.

    Args:
        documentbytes: Raw bytes of the document, parsed as a PDF.

    Returns:
        The bytes of a newly written PDF holding the same pages, or ``None``
        when the parsed document exposes no metadata (the caller then keeps
        the original bytes).

    Any error raised by PyPDF2 while parsing or writing propagates to the
    caller.
    """
    source_pdf = PyPDF2.PdfReader(BytesIO(documentbytes))

    # No metadata present: signal "nothing changed" with None.
    if source_pdf.metadata is None:
        return None

    # Only the pages are carried over to the fresh writer; the source
    # document's metadata is not copied.
    rebuilt_pdf = PyPDF2.PdfWriter()
    for page in source_pdf.pages:
        rebuilt_pdf.add_page(page)

    output = BytesIO()
    rebuilt_pdf.write(output)
    return output.getvalue()


def __getzipfilepath(foldername, filename):
return (
Expand Down

0 comments on commit bda7e52

Please sign in to comment.