-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_pdf.py
30 lines (23 loc) · 976 Bytes
/
preprocess_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from pdf2image import convert_from_path
from PIL import Image
import os
print(os.getcwd())
def split_scanned_book_pages(pdf_path, output_folder):
# Convert PDF to a list of images
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
# Calculate the width and height of an image
width, height = image.size
# The x-coordinate of the split is at half the width
x_coordinate = width // 2
# Split the image into two halves
left_page = image.crop((0, 0, x_coordinate, height))
right_page = image.crop((x_coordinate, 0, width, height))
# Save each page as an image
if i != 0:
left_page.save(f'{output_folder}/page_{2*i+1-1}.jpg')
right_page.save(f'{output_folder}/page_{2*i+2-1}.jpg')
# Specify the path to your PDF file and the output folder
pdf_path = './Padilla.pdf'
output_folder = './output_pages'
split_scanned_book_pages(pdf_path, output_folder)