-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathwordSim.py
executable file
·85 lines (60 loc) · 2.65 KB
/
wordSim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from config import *
################################################################################
def loadPPDB(ppdbFileName='Resources/ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs'):
    """Load word pairs from a PPDB file into the global ``ppdbDict``.

    Every line is split on whitespace; the first two tokens form the key
    ``(token0, token1)``, which is mapped to the global ``ppdbSim`` value.
    Any further tokens on the line are ignored. Lines with fewer than two
    tokens (empty and whitespace-only lines included) are skipped.

    Args:
        ppdbFileName: Path of the pairs file to read.

    Raises:
        IOError (Python 2) / OSError (Python 3): if the file cannot be opened.
    """
    global ppdbSim
    global ppdbDict

    # The context manager closes the handle even if parsing raises.
    with open(ppdbFileName, 'r') as ppdbFile:
        for line in ppdbFile:
            # split() with no argument already discards surrounding
            # whitespace, so the tokens need no extra strip().
            tokens = line.split()
            if len(tokens) < 2:
                # Blank or malformed line: nothing to register.
                continue
            ppdbDict[(tokens[0], tokens[1])] = ppdbSim
################################################################################
################################################################################
def presentInPPDB(word1, word2):
    """Tell whether two words form a pair stored in the global ``ppdbDict``.

    Both words are lower-cased before the lookup, and the pair is searched
    in both orders.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        True if ``(word1, word2)`` or ``(word2, word1)`` (lower-cased) is a
        key of ``ppdbDict``; False otherwise.
    """
    global ppdbDict

    # Lower-case once and reuse for both lookup orders.
    first = word1.lower()
    second = word2.lower()
    # Always yields a real bool instead of falling through to None.
    return (first, second) in ppdbDict or (second, first) in ppdbDict
################################################################################
##############################################################################################################################
def _canonicalize(word):
    """Remove '.', '-' and ',' from words longer than one character."""
    # Single-character words are kept as-is so that a lone '.', '-' or ','
    # is not reduced to the empty string.
    if len(word) > 1:
        return word.replace('.', '').replace('-', '').replace(',', '')
    return word


def wordRelatedness(word1, pos1, word2, pos2):
    """Score how related two words are: 1, ``ppdbSim`` or 0.

    The checks run in this order and the first one that applies decides:

    1. Equal after stripping '.', '-' and ',' and ignoring case -> 1.
    2. Equal stems (via the global ``stemmer``), ignoring case -> 1.
    3. Both are digit strings but different -> 0.
    4. Both tagged 'cd', neither is a digit string, and different -> 0.
    5. Exactly one of the two is in ``stopwords`` -> 0.
    6. Either one is in ``punctuations`` -> 0.
    7. The pair is found by ``presentInPPDB`` -> ``ppdbSim``; otherwise 0.

    ``stemmer``, ``stopwords``, ``punctuations`` and ``ppdbSim`` are defined
    outside this function (presumably by ``config``).

    Args:
        word1: First word.
        pos1: Part-of-speech tag of ``word1`` (compared case-insensitively).
        word2: Second word.
        pos2: Part-of-speech tag of ``word2`` (compared case-insensitively).

    Returns:
        1 for a match, ``ppdbSim`` for a PPDB pair, 0 otherwise.
    """
    global stemmer
    global ppdbSim
    global punctuations

    canonicalWord1 = _canonicalize(word1)
    canonicalWord2 = _canonicalize(word2)

    if canonicalWord1.lower() == canonicalWord2.lower():
        return 1

    if stemmer.stem(word1).lower() == stemmer.stem(word2).lower():
        return 1

    # '!=' replaces the Python-2-only '<>' operator; it is valid on both
    # Python 2 and Python 3.
    if (canonicalWord1.isdigit() and canonicalWord2.isdigit()
            and canonicalWord1 != canonicalWord2):
        return 0

    if (pos1.lower() == 'cd' and pos2.lower() == 'cd'
            and not canonicalWord1.isdigit()
            and not canonicalWord2.isdigit()
            and canonicalWord1 != canonicalWord2):
        return 0

    lowered1 = word1.lower()
    lowered2 = word2.lower()

    # stopwords can be similar to only stopwords: reject when exactly one
    # of the two words is a stopword.
    if (lowered1 in stopwords) != (lowered2 in stopwords):
        return 0

    # punctuations can only be either identical or totally dissimilar;
    # identical ones already returned 1 above.
    if word1 in punctuations or word2 in punctuations:
        return 0

    if presentInPPDB(lowered1, lowered2):
        return ppdbSim
    return 0
##############################################################################################################################
# Runs at import time: fills ppdbDict from the default PPDB file path.
loadPPDB()