-
Notifications
You must be signed in to change notification settings - Fork 7
/
ocr.py
74 lines (60 loc) · 2.62 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
# ImageMagick (via the Wand binding)
from wand.image import Image
from wand.color import Color
# default image module
try:
from PIL import Image as P_image
except ImportError:
import Image as P_image
# ocr engine
import pytesseract
# opencv
import cv2
def ocr_process(filename, resolution=450, page_seg_method='3'):
    """Run OCR over every page of a PDF and return the recognised text.

    Each page is rasterised with Wand (ImageMagick), flattened onto a white
    background and written to ``converted_image/<name>-<page>.png``.  The
    saved image is then converted to grayscale with OpenCV, written to
    ``preprocessed_image/<name>-<page>.png`` and passed to the Tesseract OCR
    engine.  ``<name>`` is the base name of ``filename`` without its
    extension and ``<page>`` is the zero-based page index.  Both output
    directories are created in the current working directory when missing.

    Args:
        filename: Path of the PDF to read.
        resolution: Resolution handed to Wand when loading the PDF.  Higher
            values yield larger page images and therefore slower OCR.
        page_seg_method: Tesseract page segmentation mode, forwarded as
            ``--psm <value>``.  Both strings and integers are accepted.

    Returns:
        The recognised text of all pages, concatenated in page order with
        no extra separator inserted between pages.
    """
    converted_dir = 'converted_image'
    preprocessed_dir = 'preprocessed_image'
    # Loop-invariant pieces are computed once up front.
    base_name = os.path.splitext(os.path.basename(filename))[0]
    # str() lets callers pass the mode as an int (e.g. 3) as well as '3'.
    tesseract_config = '--psm ' + str(page_seg_method)
    page_texts = []

    # Use ImageMagick to load the PDF pages; the context manager releases the
    # underlying native resources even if a later step raises.
    with Image(filename=filename, resolution=resolution) as all_pages:
        # Create the output directories once; an already existing directory
        # is fine, while real failures (e.g. permissions) are not hidden.
        os.makedirs(converted_dir, exist_ok=True)
        os.makedirs(preprocessed_dir, exist_ok=True)

        for i, page in enumerate(all_pages.sequence):
            image_filename = '{}-{}.png'.format(base_name, i)
            path_filename = os.path.join(converted_dir, image_filename)

            # Convert the page to a PNG flattened onto a white background.
            with Image(page) as img:
                img.format = 'png'
                img.background_color = Color('white')
                img.alpha_channel = 'remove'
                img.save(filename=path_filename)

            # Turn the image to grayscale.
            im = cv2.imread(path_filename)
            im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

            # Unnecessary for formal invoices, kept for reference:
            # # 2. Binarise the image with an adaptive threshold
            # im_inv = cv2.adaptiveThreshold(im_gray,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,9,2)
            # # 3. Apply noise reduction
            # kernel = 1/16*np.array([[1,2,1],[2,4,2],[1,2,1]])
            # im_blur = cv2.filter2D(im_inv, -1, kernel)

            # Save the preprocessed image next to the converted one.
            path_filename2 = os.path.join(preprocessed_dir, image_filename)
            cv2.imwrite(path_filename2, im_gray)

            # Run the Tesseract OCR engine on the page; the image file is
            # closed as soon as the text has been extracted.
            with P_image.open(path_filename2) as page_image:
                page_texts.append(pytesseract.image_to_string(
                    page_image, lang="eng", config=tesseract_config))

    return "".join(page_texts)