Skip to content

Commit

Permalink
Merge pull request #656 from bcgov/dev-DV-4708
Browse files Browse the repository at this point in the history
PDF Annotation updates
  • Loading branch information
divyav-aot authored Dec 8, 2023
2 parents e8f65a5 + f992ba6 commit d0ff432
Showing 1 changed file with 69 additions and 47 deletions.
116 changes: 69 additions & 47 deletions computingservices/DedupeServices/services/s3documentservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,67 +62,88 @@ def __append_if_exists(text, key, value):
text += f"{key}: {value}\n"
return text

def __construct_annotation_text(annot, page):
annot_text = ""
def extract_annotations_from_pdf(pdf_document, output_bytestream):
    """Swap annotation comments for legend placeholders and collect the originals.

    Every page of ``pdf_document`` is scanned. An annotation that carries
    comment text gets its content replaced by a ``Legend [page:index]``
    placeholder and its title replaced by a fixed label; the original values
    are recorded. An annotation without comment text is deleted from its page.
    The modified document is then copied into a fresh PDF which is saved to
    ``output_bytestream``.

    Args:
        pdf_document: Open document exposing ``page_count`` and ``load_page``
            (presumably a ``fitz`` document); it is modified in place.
        output_bytestream: Target passed to ``save`` for the updated PDF.

    Returns:
        A list of dicts, one per annotation that had comment text, with the
        keys ``Legend``, ``OriginalContent``, ``Author``, ``Subject`` and
        ``PageNumber`` (the 0-based page index used by the loop).
    """
    # The replacement title is a constant, so bind it once up front. Assigning
    # it only when the annotation already had a title left the name unbound
    # (or stale) for untitled annotations that still carried comment text.
    replacement_author = "Original Document Comment"
    all_annotations = []
    output_pdf = fitz.open()
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            index = 1  # legend counter restarts on every page
            for annot in page.annots():
                content = annot.info.get('content', '')
                if not content:
                    # Nothing to preserve: drop the annotation from the page.
                    page.delete_annot(annot)
                    continue

                legend_text = f"Legend [{page_num}:{index}]"
                new_content = legend_text + ":The comment text of the annotation is added as part of the pdf."
                index += 1
                author = annot.info.get('title', '')
                annot.set_info(content=new_content, title=replacement_author)
                annot.update()
                all_annotations.append({
                    'Legend': legend_text,
                    'OriginalContent': content,
                    'Author': author,
                    'Subject': annot.info.get('subject', ''),
                    'PageNumber': page_num,
                })

        output_pdf.insert_pdf(pdf_document)
        # NOTE(review): relies on the document's truthiness (presumably its
        # page count) to skip saving an empty PDF - confirm.
        if output_pdf:
            output_pdf.save(output_bytestream)
    finally:
        # Release the temporary copy whether or not saving succeeded.
        output_pdf.close()
    return all_annotations

# Extract required fields
name = annot.info.get('name')
content = annot.info.get('content')
title = annot.info.get('title')
subject = annot.info.get('subject')
creationdate = annot.info.get('creationDate', '')
creationdate = convert_to_pst(creationdate) if creationdate else ''
moddate = annot.info.get('modDate', '')
moddate = convert_to_pst(moddate) if moddate else ''

associatedtext = ""
# Check if annotation is a square(4), circle(5), polygon(6), highlight(8),
# underline(9), strikeOut(11), caret(14), ink/pencil draw(15)
if annot.type[0] in (4, 5, 6, 8, 9, 11, 14, 15) :
text = page.get_text("text", clip=annot.rect)
associatedtext = text

annot_text = __append_if_exists(annot_text, 'Annotation Type', annot.type[1])
annot_text = __append_if_exists(annot_text, 'Name', name)
annot_text = __append_if_exists(annot_text, 'Content', content)
annot_text = __append_if_exists(annot_text, 'Title', title)
annot_text = __append_if_exists(annot_text, 'Subject', subject)
annot_text = __append_if_exists(annot_text, 'Creation Date', creationdate)
annot_text = __append_if_exists(annot_text, 'Modified Date', moddate)
annot_text = __append_if_exists(annot_text, 'Associated Text', associatedtext)

def __constructannotationtext(annot):
    """Render one extracted annotation record as a block of text.

    The ``Legend``, ``Subject``, ``Author`` and ``OriginalContent`` entries of
    ``annot`` are passed, in that order, through ``__append_if_exists`` under
    the labels 'Legend', 'Subject', 'Author' and 'Original Content'. A single
    newline is appended after the last field.

    Args:
        annot: Mapping that contains the four keys listed above.

    Returns:
        The accumulated text followed by a trailing newline.
    """
    # (label written to the text, key looked up in the record)
    labelled_fields = (
        ('Legend', "Legend"),
        ('Subject', "Subject"),
        ('Author', "Author"),
        ('Original Content', "OriginalContent"),
    )
    rendered = ""
    for label, field in labelled_fields:
        rendered = __append_if_exists(rendered, label, annot[field])
    return rendered + "\n"

def add_annotations_as_text_to_pdf(source_document, bytes_stream):
output_bytestream = BytesIO()
annotations = extract_annotations_from_pdf(source_document, output_bytestream)
updated_stream = output_bytestream.getvalue()
updated_document = fitz.open(stream=updated_stream)
processedpagecount = 1
destination_document = fitz.open()
text_line_spacing = 15
page_height = 792
new_page_index = 0
for page_index in range(source_document.page_count):
for page_index in range(updated_document.page_count):
if new_page_index == 0:
new_page_index = page_index
text_start_position = 50
source_page = source_document.load_page(page_index)
page_rotation = source_page.rotation
source_page.set_rotation(0)
source_width = source_page.rect.width
source_height = source_page.rect.height
new_page = destination_document.new_page(new_page_index,width=source_width, height=source_height)
new_page.show_pdf_page(new_page.rect, source_document, page_index)
new_page.set_rotation(page_rotation)
annotations = source_page.annots()

for annot in annotations:
annot_text = __construct_annotation_text(annot, source_page)
annotations_on_page = [annot for annot in annotations if annot.get('PageNumber') == page_index]
for annot in annotations_on_page:
annot_text = __constructannotationtext(annot)
lines_needed = len(annot_text.split('\n'))
print(f'annot_text = {annot_text}')
if text_start_position == 50:
new_page_index += 1
new_page = destination_document.new_page(new_page_index,width=source_width, height=source_height)

if text_start_position + lines_needed * text_line_spacing > source_height - 50:
updated_document.insert_page(new_page_index)
new_page = updated_document.load_page(new_page_index)
if text_start_position + lines_needed * text_line_spacing > page_height - 50:
new_page_index += 1
new_page = destination_document.new_page(new_page_index,width=source_width, height=source_height)
updated_document.insert_page(new_page_index)
new_page = updated_document.load_page(new_page_index)
text_start_position = 50
try:
new_page.insert_text((50, text_start_position), annot_text, fontsize=10)
Expand All @@ -131,10 +152,11 @@ def add_annotations_as_text_to_pdf(source_document, bytes_stream):
text_start_position += lines_needed * text_line_spacing
new_page_index += 1

processedpagecount = destination_document.page_count
destination_document.save(bytes_stream)
destination_document.insert_pdf(updated_document)

if destination_document:
processedpagecount = destination_document.page_count
destination_document.save(bytes_stream)
destination_document.close()
del destination_document
return processedpagecount
Expand Down

0 comments on commit d0ff432

Please sign in to comment.