-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_titles.py
executable file
·58 lines (41 loc) · 2.08 KB
/
check_titles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/python
## -*- coding: utf-8 -*-
import os, re, json, codecs
from config import *
from lxml import etree
# Lexicon tags that, when they are a word's ONLY tag, mark it as an
# unambiguous noun or adjective. Verb tags are matched by the 'vb' prefix.
# NOTE(review): these look like lower-cased Penn Treebank tags -- confirm
# against the lexicon file.
_NOUN_ADJ_TAGS = ('nn', 'nns', 'nnp', 'nnps', 'jj', 'jjr')

# Compiled once; both patterns are applied to every title.
_NON_ALNUM = re.compile(r'[^A-Za-z0-9]')
_WHITESPACE = re.compile(r'\s+')


def normalize_title(raw_title):
    """Return raw_title reduced to ASCII letters and digits.

    Every character outside A-Z, a-z and 0-9 becomes a space, the result is
    stripped, and runs of whitespace collapse to a single space.
    """
    return _WHITESPACE.sub(' ', _NON_ALNUM.sub(' ', raw_title).strip())


def is_unambiguous_content_word(pos_list):
    """Return True if pos_list holds exactly one noun, adjective or verb tag.

    Entries with zero tags or with several tags are never flagged, and the
    placeholder entry ['UNK'] used for unknown words is not flagged either.
    """
    if len(pos_list) != 1:
        return False
    tag = pos_list[0]
    return tag in _NOUN_ADJ_TAGS or tag.startswith('vb')


def unchanged_content_words(old_title, new_title, lexicon):
    """List the words of old_title that reappear in new_title and are flagged.

    Both titles are lower-cased and split on non-alphanumeric characters.
    A word is reported when it occurs in both titles and its lexicon entry
    satisfies is_unambiguous_content_word(). Words missing from the lexicon
    are looked up as ['UNK'] and therefore never reported.

    lexicon is presumably a dict mapping lower-cased words to lists of tag
    strings -- verify against the lexicon file.

    Returns the flagged words in old_title order (duplicates preserved).
    """
    # A set makes the per-word membership test O(1).
    new_words = set(_NON_ALNUM.split(new_title.lower()))
    flagged = []
    for word in _NON_ALNUM.split(old_title.lower()):
        if word not in new_words:
            continue
        if is_unambiguous_content_word(lexicon.get(word, ['UNK'])):
            flagged.append(word)
    return flagged


def main():
    """Print an ERROR line for every summary file whose title was not reworded.

    Loads the JSON lexicon at PATH_TO_LEXICON, then parses every file in
    PATH_TO_TEXT_OUTPUTS whose name contains '_summary.xml' and compares the
    root element's 'title' attribute with its 'print_title' attribute.
    A tab-separated line (ERROR, file name, old title, new title) is printed
    when:
      * either attribute is missing (the missing one is shown as empty),
      * the two normalized titles are identical, or
      * the titles share at least one unambiguous noun/adjective/verb.
    """
    # Context manager so the lexicon file handle is closed after reading.
    with codecs.open(PATH_TO_LEXICON, 'r', encoding='utf-8') as lexicon_file:
        lexicon = json.loads(lexicon_file.read())

    for file_name in os.listdir(PATH_TO_TEXT_OUTPUTS):
        if '_summary.xml' not in file_name:
            continue

        # os.path.join works whether or not the configured directory ends
        # with a path separator.
        tree = etree.parse(os.path.join(PATH_TO_TEXT_OUTPUTS, file_name))
        root = tree.getroot()

        raw_title = root.get('title')
        raw_print_title = root.get('print_title')
        # A missing attribute is reported instead of aborting the whole run.
        attribute_missing = raw_title is None or raw_print_title is None

        old_title = normalize_title(raw_title or '')
        # print_title may contain the two-character sequence backslash + 'n'.
        # Turn it into a real newline first; otherwise the backslash would
        # become a space and the 'n' would be glued onto the next word.
        new_title = normalize_title(
            (raw_print_title or '').replace('\\n', '\n'))

        if (attribute_missing
                or old_title == new_title
                or unchanged_content_words(old_title, new_title, lexicon)):
            # Single-argument print() behaves the same on Python 2 and 3.
            print('ERROR' + '\t' + file_name + '\t' + old_title + '\t' + new_title)


if __name__ == "__main__":
    main()