-
Notifications
You must be signed in to change notification settings - Fork 65
/
app.py
131 lines (99 loc) · 3.5 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
'''Flask wrapper exposing the file upload and text extraction utilities
'''
import os
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from global_common import ports
from global_common import PREFIX
from extraction import extraction
from splitting import splitting
app = Flask(__name__)

# Listening port: the PORT environment variable takes precedence over the
# entry registered for this service in global_common.ports.
port = int(os.environ.get("PORT", ports["data_extraction"]))

# Every working directory is created under the current working directory.
path = os.getcwd()

# Project directories are defined as follows:
# -data_dir-: parent of the three working directories below.
data = os.path.join(path, 'data')
# -upload_dir-: contains the uploaded files.
uploads = os.path.join(data, 'uploads')
# -preparation_dir-: contains processed & prepared files.
prepare = os.path.join(data, 'files_preparation')
# -output_dir-: contains the generated text files.
outputs = os.path.join(data, 'outputs')

# exist_ok=True makes the creation atomic from our point of view: a separate
# isdir() check followed by mkdir() can fail with FileExistsError when two
# processes start at the same time. `data` comes first because it is the
# parent of the other three.
for _directory in (data, uploads, prepare, outputs):
    os.makedirs(_directory, exist_ok=True)
# Extensions accepted for upload: lower-case, leading dot included.
ALLOWED_EXTENSIONS = {'.pdf'}


def allowed_file(filename):
    '''Assess if the file extension is in the allowed list

    The comparison is case-insensitive and only looks at the last
    extension of the name (``a.tar.pdf`` is a ``.pdf``).

    :param filename: name of the file to check; may be ``None`` or empty.
    :return: ``True`` when the extension is in ``ALLOWED_EXTENSIONS``,
        ``False`` otherwise, including when no name is given.
    '''
    # A missing name has no extension; answering False here avoids the
    # TypeError os.path.splitext raises on None.
    if not filename:
        return False
    extension = os.path.splitext(filename)[1].lower()
    return extension in ALLOWED_EXTENSIONS
@app.route(PREFIX + '/upload', methods=['POST'])
def upload():
    '''Upload files to process

    Expects the files under the ``files[]`` field of the request. Every
    file accepted by ``allowed_file`` is saved in the uploads directory
    under the name returned by ``secure_filename``; the others are
    reported back by their original name.

    :return: a JSON response with status
        * 500 and a ``message`` when the request has no ``files[]`` part;
        * 200 and a ``message`` when every file was saved;
        * 404 and one ``<filename>: <reason>`` entry per rejected file,
          plus a ``message`` entry when at least one other file was saved.
    '''
    # No explicit method check: the route is registered with
    # methods=['POST'], so this view is never called for another method.

    # NOTE(review): 500/404 are kept because clients may rely on them;
    # 400 would be the conventional answer to a bad request - confirm with
    # the API consumers before changing.
    if 'files[]' not in request.files:
        resp = jsonify({'message': 'No file part in the request'})
        resp.status_code = 500
        return resp

    rejected = {}
    saved_any = False
    for file in request.files.getlist('files[]'):
        if file and allowed_file(file.filename):
            # The client-supplied name is untrusted: sanitise it before
            # using it to build a path on disk.
            filename = secure_filename(file.filename)
            file.save(os.path.join(uploads, filename))
            saved_any = True
        else:
            rejected[file.filename] = 'File type is not allowed'

    if saved_any and not rejected:
        resp = jsonify({'message': 'Files successfully uploaded'})
        resp.status_code = 200
        return resp

    # At least one file was rejected; mention the partial success, if any,
    # next to the per-file errors.
    if saved_any:
        rejected['message'] = 'File(s) successfully uploaded'
    resp = jsonify(rejected)
    resp.status_code = 404
    return resp
@app.route(PREFIX + '/extraction', methods=['POST'])
def extract_function():
    '''Do extract data from files

    Runs ``splitting`` from the uploads directory into the preparation
    directory, then ``extraction`` from the preparation directory into
    the outputs directory.

    :return: a JSON response carrying a ``message`` with status
        * 500 when the uploads directory is empty;
        * 200 when both steps completed;
        * 404 when either step raised an exception.
    '''
    # No explicit method check: the route is registered with
    # methods=['POST'], so this view is never called for another method.

    # Nothing to do when no file has been uploaded yet.
    if not os.listdir(uploads):
        resp = jsonify({'message': 'Files not found'})
        resp.status_code = 500
        return resp

    try:
        # splitting : split docs into single pages.
        splitting(uploads, prepare)
        # extraction: extract text from pages.
        extraction(prepare, outputs)
    except Exception:
        # `except Exception` (not a bare except) lets SystemExit and
        # KeyboardInterrupt propagate; the traceback is logged because the
        # client only receives a generic message.
        app.logger.exception('Extraction pipeline failed')
        resp = jsonify({'message': 'error occurs while extraction'})
        resp.status_code = 404
        return resp

    resp = jsonify({'message': 'Files successfully extracted '})
    resp.status_code = 200
    return resp
if __name__ == '__main__':
    # The server listens on every interface (0.0.0.0). Debug mode enables
    # the Werkzeug interactive debugger, which lets whoever can reach the
    # server execute arbitrary Python code, so it is opt-in: set
    # FLASK_DEBUG=1 (or true/yes/on) for local development only.
    debug = os.environ.get('FLASK_DEBUG', '').strip().lower() in (
        '1', 'true', 'yes', 'on')
    app.run(debug=debug, host='0.0.0.0', port=port)