-
Notifications
You must be signed in to change notification settings - Fork 0
/
words.py
99 lines (89 loc) · 2.67 KB
/
words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import html
import os
import re
import string
def filelist(root):
    """Return fully-qualified filenames found under the ``root`` directory.

    The tree is walked bottom-up (``topdown=False``), so a directory's files
    are listed after those of all its subdirectories and the files directly
    inside ``root`` come last. Each name is built as
    ``<directory> + "/" + <file>``.

    NOTE(review): the last collected name is dropped before returning
    (``names[:-1]``), so one file is always omitted and a tree holding a
    single file yields ``[]``. That contradicts the summary line above but
    looks deliberate -- confirm with the callers before changing it.
    """
    names = []
    for directory, _subdirs, files in os.walk(root, topdown=False):
        # Plain "/" concatenation is kept on purpose so the returned strings
        # stay identical to what existing callers may already have stored.
        names.extend(directory + "/" + name for name in files)
    return names[:-1]
def get_text(fileName):
    """Return the entire contents of the file ``fileName`` as one string.

    The file is decoded as Latin-1. Every byte value is a valid Latin-1
    character, so decoding itself cannot fail on arbitrary input.

    Raises ``OSError`` (e.g. ``FileNotFoundError``) if the file cannot be
    opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(fileName, encoding="latin-1") as f:
        return f.read()
# Characters treated as word separators: ASCII punctuation, digits, carriage
# returns, tabs and newlines. Compiled once at import time because words() is
# called once per line of every document being summarized.
_NON_WORD_CHARS = re.compile("[" + re.escape(string.punctuation) + r"0-9\r\t\n]")


def words(text):
    """Return the list of normalized words found in the string ``text``.

    Normalization steps:

    1. Every punctuation character (``string.punctuation``), digit, carriage
       return, tab and newline is replaced by a single space, so that the
       words on either side do not clump together.
    2. The result is split on the space character.
    3. Pieces shorter than 3 characters are dropped (a, an, to, at, be, ...).
    4. The remaining words are lowercased.

    Duplicates are kept and the original order is preserved.
    """
    spaced = _NON_WORD_CHARS.sub(" ", text)
    # Splitting on " " yields empty strings between adjacent separators; the
    # length filter discards them together with the short words.
    return [piece.lower() for piece in spaced.split(" ") if len(piece) > 2]
def results(docs, terms, max_results=100, max_lines=2):
    """Return an HTML page listing the documents that matched a search.

    Args:
        docs: list of fully-qualified filenames of the matching documents.
        terms: list of search-term strings.
        max_results: maximum number of documents to list (default 100).
        max_lines: maximum number of matching lines to quote per document
            (default 2).

    Returns:
        A string holding a complete HTML document. The heading shows the
        search terms (duplicates removed, in the order given) and the total
        number of documents in ``docs``, including those beyond
        ``max_results``. Each listed document gets a ``file://`` link followed
        by the first ``max_lines`` lines that contain at least one search
        term, with the matching words in bold.

    Each file is read with ``get_text`` and each line is tokenized with
    ``words``, so the quoted lines show normalized words rather than the raw
    text of the file.
    """
    ndocs = len(docs)  # the heading reports every hit, not only those listed
    unique_terms = list(dict.fromkeys(terms))  # de-duplicate, keep query order
    wanted = set(unique_terms)

    entries = []
    for path in docs[:max_results]:
        snippets = []
        for line in get_text(path).split("\n"):
            if len(snippets) >= max_lines:
                break  # enough matching lines quoted for this document
            line_words = words(line)
            if wanted.isdisjoint(line_words):
                continue
            marked = (
                f" <b>{html.escape(w)}</b>" if w in wanted else f" {html.escape(w)}"
                for w in line_words
            )
            snippets.append("".join(marked) + "<br>")

        # Escape the filename: it lands both inside an attribute value and in
        # element text, and may legally contain &, <, > or quotes.
        link = html.escape(str(path), quote=True)
        summary = "".join(snippets)
        entries.append(
            f'\n<p><a href="file://{link}">{link}</a><br>\n{summary}<br>\n'
        )

    query = html.escape(" ".join(unique_terms))
    body = "".join(entries)
    return (
        "\n<html>\n<body>\n"
        f"<h2>Search results for <b>{query}</b> in {ndocs} files</h2>\n"
        f"{body}\n"
        "</body>\n</html>\n"
    )
def filenames(docs):
    """Strip the directory part from each fully-qualified filename in ``docs``.

    Returns a new list with the final path component of every entry, in the
    same order; returns an empty list when ``docs`` is ``None``.
    """
    return [] if docs is None else list(map(os.path.basename, docs))