# config.py (80 lines, 63 loc, 2.21 KB)
# Note: the source repository was archived by its owner on Aug 12, 2019 and is now read-only.
# Citation style identifiers, one per entry, kept in alphabetical-ish order.
# NOTE(review): these look like CSL style names — confirm against the code
# that consumes STYLES.
STYLES = [
    'acm-sig-proceedings',
    'american-chemical-society',
    'american-chemical-society-with-titles',
    'american-institute-of-physics',
    'american-sociological-association',
    'apa',
    'bmc-bioinformatics',
    'chicago-author-date',
    'elsevier-without-titles',
    'elsevier-with-titles',
    'harvard3',
    'ieee',
    'iso690-author-date-en',
    'modern-language-association',
    'springer-basic-author-date',
    'springer-lecture-notes-in-computer-science',
    'vancouver',
]
# data cleaning settings
#
# All regex fragments below are written as raw strings so that backslashes
# reach the regex engine untouched; non-raw '\d', '\.', '\[' ... are invalid
# string escapes (warning today, error in a future Python).

# NOTE(review): presumably the minimum length a reference string must have to
# be kept — confirm against the cleaning code.
MIN_REF_LEN = 11

# Boilerplate fragments: trailing "Retrieved" / "Available from:" / "doi:"
# and the literal markers "[Internet]" / "[online]".
REGEX_REMOVE = [
    r'Retrieved$',
    r'Available from:$',
    r'\[Internet\]',
    r'\[online\]',
    r'doi:$',
]

# Leading enumeration markers "(1)", "[1]", "1.", a bare "1" not followed by
# another digit, and a trailing period.
# NOTE(review): the name suggests these are removed at random — confirm.
REGEX_RANDOM_REMOVE = [
    r'^\(1\)',
    r'^\[1\]',
    r'^1\.',
    r'^1(?!\d)',
    r'\.$',
]

# Four-digit year starting with 16, 17, 18, 19 or 20 (non-capturing group).
YEAR_PATTERN = r'(?:16|17|18|19|20)\d{2}'
# Three-letter English month abbreviations (non-capturing group).
MONTH_ABBR_PATTERN = '(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
# Full English month names (non-capturing group).
MONTH_PATTERN = ('(?:January|February|March|April|May|June|July|August|'
                 'September|October|November|December)')

# Maps a regex to its replacement text. Where the pattern captures the year
# (group 1), the replacement keeps only that year via the \1 backreference;
# the two patterns without a year map to the empty string.
# NOTE(review): presumably each pair is fed to re.sub — confirm in the caller.
REGEX_MONTH_REMOVE = {
    # "(Jan. 2010)" -> "(2010)"
    r'\(' + MONTH_ABBR_PATTERN + r'\. (' + YEAR_PATTERN + r')\)': r'(\1)',
    # "Jan. 2010." -> "2010." (abbreviation must not follow a letter)
    r'(?<![a-zA-Z])' + MONTH_ABBR_PATTERN + r'\. (' + YEAR_PATTERN + r')\.':
        r'\1.',
    # "2010 Jan;" -> "2010;" (year must not follow a digit)
    r'(?<!\d)(' + YEAR_PATTERN + r') ' + MONTH_ABBR_PATTERN + r';': r'\1;',
    # "(January)" -> ""
    r'\(' + MONTH_PATTERN + r'\)': '',
    # "(January 15)" -> ""
    r'\(' + MONTH_PATTERN + r' [123]?\d\)': '',
    # "January 2010." -> "2010."
    r'(?<![a-zA-Z])' + MONTH_PATTERN + r' (' + YEAR_PATTERN + r')\.': r'\1.',
    # "15 January 2010." -> "2010."
    r'(?<!\d)[123]?\d\ ' + MONTH_PATTERN + r' (' + YEAR_PATTERN + r')\.':
        r'\1.',
}
# feature token mapping
#
# Maps a regex to the token name that stands for whatever the regex matches.
# Patterns containing regex escapes are raw strings so the backslashes are
# not treated as (invalid) Python string escapes.
# NOTE(review): the entries look designed to be applied in insertion order
# (specific word shapes first, the catch-all 'other' last) — confirm against
# the consumer before reordering.
REGEX_WORD_TO_TOKEN = {
    # Alphabetic runs, delimited by non-letters on both sides.
    '(?<![a-zA-Z])[a-z]{2,}(?![a-zA-Z])': 'lcword',       # all lowercase, 2+ letters
    '(?<![a-zA-Z])[a-z](?![a-zA-Z])': 'lclett',           # single lowercase letter
    '(?<![a-zA-Z])[A-Z]{2,}(?![a-zA-Z])': 'ucword',       # all uppercase, 2+ letters
    '(?<![a-zA-Z])[A-Z](?![a-zA-Z])': 'uclett',           # single uppercase letter
    '(?<![a-zA-Z])[A-Z][a-z]+(?![a-zA-Z])': 'capword',    # Capitalised word
    # Any letter run that contains at least one uppercase letter.
    '[A-Za-z]*[A-Z][A-Za-z]*': 'word',
    # Digit runs: a standalone year (see YEAR_PATTERN), then any other number.
    r'(?<!\d)' + YEAR_PATTERN + r'(?!\d)': 'year',
    r'(?<!\d)\d+(?!\d)': 'num',
    # Punctuation.
    r'\.': 'dot',
    ',': 'comma',
    r'\(': 'lpar',
    r'\)': 'rpar',
    r'\[': 'lbracket',
    r'\]': 'rbracket',
    ':': 'colon',
    ';': 'semicolon',
    '/': 'slash',
    '&': 'and',
    # Hyphen/dash code points. Deliberately NOT raw strings: the \uXXXX
    # escapes must be decoded by Python so the key holds the characters
    # themselves.
    '[\u002D\u00AD\u2010\u2011\u2012\u2013\u2014\u2015\u207B' +
    '\u208B\u2212-]': 'dash',
    # Curly double quotes.
    '[“”]': 'quot',
    # Any run of characters that are neither lowercase ASCII letters nor spaces.
    '[^a-z ]+': 'other',
}
# training settings
# NOTE(review): presumably the maximum number of features kept when building
# the feature representation — confirm against the training code.
N_FEATURES = 5000
# NOTE(review): presumably the (min, max) n-gram sizes extracted from the
# token sequence — confirm against the training code.
NGRAM_RANGE = (2, 4)