Files.py
import json
import os
from multiprocessing import Pool


def readFromFiles(eachFile):
    """Build a per-file index mapping each word to the line numbers it appears on."""
    contentFromFile = {}
    with open(eachFile) as file:
        for lineNumber, line in enumerate(file):
            for word in line.strip().split():
                # Strip punctuation and normalise to lower case before indexing.
                for char in ".,*!;:?":
                    word = word.replace(char, "")
                word = word.lower()
                if not word:
                    continue
                if word not in contentFromFile:
                    contentFromFile[word] = {
                        "File": {eachFile: {"Line": [lineNumber]}}}
                else:
                    contentFromFile[word]["File"][eachFile]["Line"].append(lineNumber)
    return contentFromFile
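# Illustrative sketch (not part of the original file): for a hypothetical file
# "notes.txt" whose first line is "Hello, hello world", readFromFiles("notes.txt")
# would return a structure along these lines:
#
#   {
#       "hello": {"File": {"notes.txt": {"Line": [0, 0]}}},
#       "world": {"File": {"notes.txt": {"Line": [0]}}},
#   }
#
# i.e. each word maps to the file it was found in and the zero-based line numbers
# of its occurrences, with one list entry per occurrence.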
def multiprocessingIndexWordsAndFiles():
    """Index every .txt file in the current directory in parallel and write the merged index to data.json."""
    fileList = [eachFile for eachFile in os.listdir() if eachFile.endswith(".txt")]
    with Pool() as pool:
        result_async = [pool.apply_async(readFromFiles, args=(eachFile,))
                        for eachFile in fileList]
        results = [r.get() for r in result_async]
    # Merge the per-file indexes into a single dictionary.
    data = {}
    for contentFromFile in results:
        for key, value in contentFromFile.items():
            if key in data:
                data[key]["File"].update(value["File"])
            else:
                data[key] = value
    with open('data.json', 'w') as fp:
        json.dump(data, fp, indent=4)
    print("Indexed")
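# Entry-point guard (a sketch; the excerpt above does not show how the module is
# invoked). multiprocessing requires the entry point to be guarded on spawn-based
# platforms such as Windows and recent macOS, so a typical invocation would be:
if __name__ == "__main__":
    multiprocessingIndexWordsAndFiles()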