-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknock82.py
44 lines (35 loc) · 1.19 KB
/
knock82.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import plyvel
import random
from tqdm import tqdm
import json
# Module-level database handle used by save_db() and the script entry point.
# The store at ./vector1.ldb is opened when this module is loaded and is
# created first if it does not exist yet (create_if_missing=True).
db = plyvel.DB("./vector1.ldb", create_if_missing=True)
def create_db():
    """Build the in-memory word -> context statistics table from corpus.txt.

    Every line of ``corpus.txt`` is lower-cased and split on whitespace.
    The first time a word is seen it is given a random context width ``d``
    (1 to 5 inclusive), and that width is reused for every later occurrence
    of the same word. For each occurrence, up to ``d`` tokens before and up
    to ``d`` tokens after it on the same line are tallied as context words.

    Returns:
        dict: maps each word to a record of the form
        ``{"d": width, "count": occurrences, "c": {context_word: tally}}``.

    Raises:
        OSError: if ``corpus.txt`` cannot be opened.
    """
    retval = {}
    # The database layer stores everything as UTF-8, so read the corpus with
    # the same encoding instead of relying on the platform default.
    with open("corpus.txt", "r", encoding="utf-8") as fp:
        for line in tqdm(fp):
            # split() without an argument discards the empty tokens that
            # split(" ") produced for blank lines and repeated spaces.
            words = line.lower().split()
            for i, word in enumerate(words):
                entry = retval.get(word)
                if entry is None:
                    # Start at zero: the increment below counts this first
                    # occurrence, so "count" equals the number of occurrences.
                    entry = {"d": random.randint(1, 5), "count": 0, "c": {}}
                    retval[word] = entry
                d = entry["d"]
                contexts = entry["c"]
                # The max() guard is required because a negative slice start
                # would wrap around; the slice end needs no guard because
                # out-of-range ends are clamped. The right window ends at
                # i + d + 1 so that it holds d tokens, matching the left one.
                neighbours = words[max(0, i - d):i] + words[i + 1:i + d + 1]
                for neighbour in neighbours:
                    contexts[neighbour] = contexts.get(neighbour, 0) + 1
                entry["count"] += 1
    return retval
def save_db():
    """Build the context table with create_db() and store every entry in db.

    Each key is the word encoded as UTF-8; each value is the word's record
    serialised to JSON and encoded as UTF-8.
    """
    table = create_db()
    for word in tqdm(table):
        encoded_key = str(word).encode("utf-8")
        serialised = json.dumps(table[word])
        db.put(encoded_key, serialised.encode("utf-8"))
    # Release the local reference to the table once everything is written.
    del table
if __name__ == "__main__":
    # Fill the database first, then print one line per stored word:
    # the word, a tab, and its context words joined by commas.
    save_db()
    for raw_word, raw_record in db:
        record = json.loads(raw_record)
        context_words = [str(context) for context in record["c"]]
        output_line = "{0}\t{1}".format(
            raw_word.decode("utf-8"),
            ",".join(context_words),
        )
        print(output_line)