-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathPdfQRSplit.py
154 lines (127 loc) · 5.95 KB
/
PdfQRSplit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import argparse
import glob
import sys
import io
import PyPDF4
import zxing
from typing import List
from tempfile import TemporaryDirectory
from PIL import Image
parser = argparse.ArgumentParser(description='Split PDF-file into separate files based on a separator barcode')
parser.add_argument('filename', metavar='inputfile', type=str,
help='Filename or glob to process')
parser.add_argument('-p', '--prefix', default="split",
help='Prefix for generated PDF files. Default: split')
parser.add_argument('-s', '--separator', default="ADAR-NEXTDOC",
help='Barcode content used to find separator pages. Default: ADAR-NEXTDOC')
parser.add_argument('-k', '--keep-page', action='store_true',
help='Keep separator page in previous document')
parser.add_argument('--keep-page-next', action='store_true',
help='Keep separator page in next document')
parser.add_argument('-b', '--brightness', type=int, default=128,
help='brightness threshold for barcode preparation (0-255). Default: 128')
parser.add_argument('-v', '--verbose', action='store_true',
help='Show verbose processing messages')
parser.add_argument('-d', '--debug', action='store_true',
help='Show debug messages')
class PdfQrSplit:
def __init__(self, filepath: str, verbose: bool, debug: bool, brightness: 128) -> None:
self.filepath = filepath
self.verbose = verbose
self.debug = debug
self.brightness = brightness
self.input_pdf = PyPDF4.PdfFileReader(filepath, "rb")
self.total_pages = self.input_pdf.getNumPages()
if verbose:
print(
"Processing file {} containing {} pages".format(
filepath, self.total_pages
)
)
def split_qr(self, split_text: str, ifiles: int) -> int:
"""Creates new files based on barcode contents.
Args:
split_text: Barcode content to recognize a separator page
Returns:
int: Number of generated files.
"""
pdfs_count = 0
current_page = 0
reader = zxing.BarCodeReader()
pdf_writer = PyPDF4.PdfFileWriter()
while current_page != self.total_pages:
if self.verbose:
print(" Analyzing page {}".format((current_page+1)))
page = self.input_pdf.getPage(current_page)
xObject = page['/Resources']['/XObject'].getObject()
with TemporaryDirectory() as temp_dir:
if self.debug:
print(" Writing page images to temporary directory {}".format(temp_dir))
split = False
for obj in xObject:
tgtn=False
if xObject[obj]['/Subtype'] == '/Image':
data = xObject[obj].getData()
if '/FlateDecode' in xObject[obj]['/Filter'] or \
'/DCTDecode' in xObject[obj]['/Filter'] or \
'/JPXDecode' in xObject[obj]['/Filter']:
tgtn = temp_dir + "/" + obj[1:] + ".png"
img = Image.open(io.BytesIO(data))
fn = lambda x : 255 if x > self.brightness else 0
img = img.convert('L').point(fn, mode='1')
img.save(tgtn)
elif self.debug:
print(f" Unknown filter type {xObject[obj]['/Filter']}")
if tgtn:
if self.debug:
print(" Wrote image {}; Checking for separator barcode".format(tgtn))
barcode = reader.decode(tgtn)
if barcode and args.separator in barcode.parsed:
if self.debug:
print(" Found separator barcode")
split = True
if split:
if args.keep_page:
pdf_writer.addPage(page)
output = args.prefix + '_' + str(ifiles) + '_' + str(pdfs_count) + '.pdf'
if self.verbose:
print(" Found separator - writing {} pages to {}".format(pdf_writer.getNumPages(), output))
with open(output, 'wb') as output_pdf:
pdf_writer.write(output_pdf)
pdf_writer = PyPDF4.PdfFileWriter()
pdfs_count += 1
#Due to a bug in PyPDF4 PdfFileReader breaks when invoking PdfFileWriter.write - reopen file
self.input_pdf = PyPDF4.PdfFileReader(filepath, "rb")
if args.keep_page_next:
pdf_writer.addPage(page)
else:
pdf_writer.addPage(page)
current_page += 1
output = args.prefix + '_' + str(ifiles) + '_' + str(pdfs_count) + '.pdf'
if self.verbose:
print(" End of input - writing {} pages to {}".format(pdf_writer.getNumPages(), output))
with open(output, 'wb') as output_pdf:
pdf_writer.write(output_pdf)
pdfs_count += 1
return pdfs_count
args = parser.parse_args()
if args.debug:
args.verbose = True
if args.brightness < 0:
args.brightness = 0
if args.brightness > 255:
args.brightness = 255
filepaths = glob.glob(args.filename)
if not filepaths:
sys.exit("Error: no file found, check the documentation for more info.")
ofiles = 0
ifiles = 0
for filepath in filepaths:
splitter = PdfQrSplit(filepath, args.verbose, args.debug, brightness=args.brightness)
ofiles += splitter.split_qr(args.separator, ifiles)
ifiles += 1
print(
"Split {} given files into {} files".format(
ifiles, ofiles
)
)