-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
218 lines (181 loc) · 7.63 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import streamlit as st
import io
import zipfile
import numpy as np
import cv2
from PIL import Image
from utils.image_processing import preprocess_image
from utils.pdf_processing import process_pdf
from utils.text_extraction import extract_text
def setup_page_config():
    """Apply the page title, icon, and wide layout used by this app."""
    page_settings = {
        "page_title": "Text Extraction Using Pytesseract",
        "page_icon": ":page_facing_up:",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)
def initialize_session_state():
    """Seed ``st.session_state`` with defaults for any key not yet present.

    Keys that already exist are left untouched, so calling this on every
    script run does not discard values stored earlier.
    """
    # Built inside the function so every call gets fresh list/dict objects.
    defaults = {
        'uploaded_files': None,
        'all_text': [],
        'individual_texts': {},
    }
    for state_key, initial_value in defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = initial_value
def create_sidebar_options():
    """Render the OCR controls in the sidebar and return their current values.

    Returns:
        dict: The boolean toggles ``apply_threshold``, ``apply_deskew``,
        ``apply_denoise`` and ``apply_contrast``, plus ``psm`` holding the
        selected layout-detection mode number.
    """
    sidebar = st.sidebar
    sidebar.header("Image Enhancement Options")

    # (option key, checkbox label, default state, tooltip), in display order.
    toggle_specs = (
        (
            'apply_threshold',
            "Sharpen Text",
            True,
            "Improves text clarity by increasing contrast between text and background. Helps with faded or low-quality scans.",
        ),
        (
            'apply_deskew',
            "Straighten Document",
            True,
            "Corrects tilted or skewed documents. Fixes images where text is not perfectly horizontal, making text easier to read.",
        ),
        (
            'apply_denoise',
            "Remove Background Noise",
            True,
            "Removes specks, graininess, and background interference. Makes text clearer in scanned documents with imperfect backgrounds.",
        ),
        (
            'apply_contrast',
            "Enhance Text Visibility",
            False,
            "Boosts text brightness and contrast. Useful for documents with poor lighting or faded print.",
        ),
    )

    settings = {}
    for option_key, label, default_on, tooltip in toggle_specs:
        settings[option_key] = sidebar.checkbox(label, value=default_on, help=tooltip)

    # Mode number -> label shown in the drop-down; key order is display order.
    psm_labels = {
        3: "Automatic Detection",
        4: "Single Column Layout",
        6: "Single Text Block",
        11: "Line by Line",
        12: "Word by Word",
    }
    settings['psm'] = sidebar.selectbox(
        "Text Layout Detection",
        options=list(psm_labels),
        format_func=lambda mode: psm_labels[mode],
        help="Choose how the system should read your document's layout. Automatic[3] is best for most documents.",
    )
    return settings
def display_processed_image(original_image, processed_image):
    """
    Display original and processed images side by side.

    Multi-channel inputs are treated as OpenCV-style BGR arrays and converted
    to RGB so colors render correctly. Single-channel inputs (a 2-D array or
    an ``(h, w, 1)`` array, e.g. a grayscale or thresholded image) are shown
    unchanged, because ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)`` raises on
    them.

    Args:
        original_image (numpy.ndarray): Original input image
        processed_image (numpy.ndarray): Preprocessed image
    """

    def _for_display(image):
        # Return an array st.image can render with correct colors.
        is_single_channel = image.ndim == 2 or (
            image.ndim == 3 and image.shape[2] == 1
        )
        if is_single_channel:
            # Nothing to reorder; cvtColor would reject this input.
            return image
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    panels = (
        ("Original Image", original_image),
        ("Processed Image", processed_image),
    )
    # One column per panel, left to right.
    for column, (title, image) in zip(st.columns(2), panels):
        with column:
            st.subheader(title)
            st.image(_for_display(image), use_column_width=True)
def process_uploaded_files(uploaded_files, options):
    """
    Run text extraction on every uploaded file, previewing images on the way.

    PDFs are handed to ``process_pdf``. Any other file is opened as an image,
    preprocessed with ``preprocess_image``, previewed, and passed to
    ``extract_text``. A failure on one file is reported with ``st.error`` and
    does not stop the remaining files.

    Args:
        uploaded_files (list): List of uploaded files (objects exposing
            ``name`` and ``type``)
        options (dict): OCR processing options
    Returns:
        tuple: ``(all_text, individual_texts)`` - a list with one formatted
        report string per successfully processed file, and a dict mapping
        file name to its extracted text. When several uploads share a name,
        later ones are stored under ``"<name> (2)"``, ``"<name> (3)"``, ...
        so that no result is overwritten.
    """
    all_text = []
    individual_texts = {}
    total_files = len(uploaded_files)

    # Progress bar for multiple file processing
    progress_bar = st.progress(0)

    for position, uploaded_file in enumerate(uploaded_files, start=1):
        try:
            if uploaded_file.type == "application/pdf":
                text = process_pdf(uploaded_file, options)
                st.warning("PDF preview not supported. Text extracted.")
            else:
                # Normalize palette / RGBA / grayscale files to 3-channel RGB.
                # np.array() on a palette image would otherwise yield color
                # indices rather than pixel values.
                with Image.open(uploaded_file) as source_image:
                    image = source_image.convert("RGB")
                image_np = np.array(image)

                st.subheader(f"Processing: {uploaded_file.name}")
                st.image(image, caption="Original Image", use_column_width=True)

                # NOTE(review): preprocess_image still receives the RGB-ordered
                # array, as before - confirm which channel order it expects.
                processed_image = preprocess_image(image_np, options)

                # The preview is cosmetic: a rendering problem must not
                # discard the OCR result for this file.
                try:
                    # display_processed_image treats its inputs as BGR, while
                    # PIL arrays are RGB, so hand it a BGR copy of the original.
                    display_processed_image(
                        cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR),
                        processed_image,
                    )
                except Exception as preview_error:
                    st.warning(
                        f"Could not preview {uploaded_file.name}: {preview_error}"
                    )

                text = extract_text(processed_image, options)

            all_text.append(f"File: {uploaded_file.name}\n\n{text}\n\n{'='*50}\n")

            # Keep every result even when two uploads share a file name.
            result_key = uploaded_file.name
            duplicate_index = 2
            while result_key in individual_texts:
                result_key = f"{uploaded_file.name} ({duplicate_index})"
                duplicate_index += 1
            individual_texts[result_key] = text
        except Exception as e:
            # Per-file boundary: report the problem and continue with the rest.
            st.error(f"Error processing {uploaded_file.name}: {e}")
        finally:
            # Advance only once this file is finished (successfully or not).
            progress_bar.progress(position / total_files)

    # Clear progress bar
    progress_bar.empty()
    return all_text, individual_texts
def create_text_downloads(all_text, individual_texts):
    """
    Render the download buttons for the extracted text.

    A plain-text download containing every entry of ``all_text`` is always
    offered. A zip archive with one ``<file name>_extracted.txt`` member per
    entry of ``individual_texts`` is offered only when that dict is non-empty.

    Args:
        all_text (list): Combined extracted texts
        individual_texts (dict): Individual file texts
    """
    # Single text file: all reports joined with newlines, UTF-8 encoded.
    joined_report = "\n".join(all_text)
    st.download_button(
        label="Download Combined Extracted Text",
        data=io.BytesIO(joined_report.encode('utf-8')),
        file_name="combined_extracted_text.txt",
        mime="text/plain",
    )

    # Nothing to archive, so no second button.
    if not individual_texts:
        return

    archive_buffer = io.BytesIO()
    with zipfile.ZipFile(
        archive_buffer,
        mode="a",
        compression=zipfile.ZIP_DEFLATED,
        allowZip64=False,
    ) as archive:
        for source_name, extracted in individual_texts.items():
            archive.writestr(f"{source_name}_extracted.txt", extracted)

    # Read the bytes only after the archive is closed, so it is complete.
    st.download_button(
        label="Download Individual Extracted Texts",
        data=archive_buffer.getvalue(),
        file_name="individual_extracted_texts.zip",
        mime="application/zip",
    )
def main():
    """Build the page: intro text, uploader, sidebar options, results, downloads."""
    setup_page_config()
    initialize_session_state()

    # Heading and usage notes
    st.title("Text Extraction using Tesseract OCR")
    st.markdown('## Upload multiple images or PDF files to extract text from.')
    st.write('From the list of Tesseract Page Segmentation Modes (PSM) on the left,\n you control how Tesseract analyzes and interprets document with varying layouts:')
    st.write(
        " Automatic detection works fine for most documents,\n"
        "\n"
        "You can also Choose a different one based on your document's structure from the list.\n"
    )

    uploaded_files = st.file_uploader(
        "Choose files",
        accept_multiple_files=True,
        type=["png", "jpg", "jpeg", "pdf"],
    )

    # The sidebar is rendered even before any file has been uploaded.
    options = create_sidebar_options()

    if not uploaded_files:
        return

    # NOTE(review): this runs on every script execution while files are
    # present; the session-state keys seeded above are not consulted here.
    all_text, individual_texts = process_uploaded_files(uploaded_files, options)
    if not all_text:
        return

    st.text_area("Extracted Text", value="\n".join(all_text), height=300)
    create_text_downloads(all_text, individual_texts)
# Entry point: build the UI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()