-
Notifications
You must be signed in to change notification settings - Fork 96
/
generate_suggestions_from_reference.py
executable file
·56 lines (49 loc) · 1.45 KB
/
generate_suggestions_from_reference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3
# Copyright Vespa.ai. All rights reserved.
from bs4 import BeautifulSoup
import sys
import json
import mmh3
def extract_links_from_html(html_file):
    """Collect the links found inside ``<pre class="pre-config">`` blocks.

    Args:
        html_file: Path of the HTML file to parse.

    Returns:
        A list of ``(link_text, href)`` tuples, one per ``<a>`` element that
        carries an ``href`` attribute, in document order.
    """
    # Decode explicitly so the result does not depend on the platform locale.
    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    links = []
    for pre_tag in soup.find_all('pre', class_='pre-config'):
        for a_tag in pre_tag.find_all('a'):
            link = a_tag.get('href')
            if link is None:
                # Anchors without a target (e.g. <a name="...">) cannot be
                # turned into a URL; skip them instead of raising KeyError.
                continue
            links.append((a_tag.get_text(), link))
    return links
# Script body.
# Command-line arguments: <html_file> <url_prefix> <namespace>
# Prints one JSON "put" operation per extracted link to stdout.
html_file = sys.argv[1]
url_prefix = sys.argv[2]
namespace = sys.argv[3]

# File name without directory and without ".html"; used in the 'type' field.
part = html_file.split("/")[-1]
part = part.replace(".html", "")
# Directory portion of html_file; links that are not fragments are joined to it.
prefix = "/".join(html_file.split("/")[:-1])

result = extract_links_from_html(html_file)
for link_text, link in result:
    if not link_text:
        continue
    # Skip deprecated entries. A substring test is used because
    # str.find() returns 0 (not a positive number) when the text *starts*
    # with "DEPRECATED", which a `> 0` comparison would let through.
    if "DEPRECATED" in link_text:
        continue
    if link.startswith("#"):
        # Fragment link: points into the page being processed.
        url = url_prefix + html_file + link
    else:
        # Any other link is appended to the directory of the page.
        url = url_prefix + prefix + '/' + link
    # Hash of the URL; used both in the document id and as the 'hash' field.
    url_hash = mmh3.hash(url)
    term = {
        'put': 'id:term-reference:term::%i' % url_hash,
        'fields': {
            'term': link_text,
            'namespace': namespace,
            'hash': url_hash,
            'type': "reference:" + part,
            'url': url,
            # NOTE(review): both counts are hard-coded to 2; the reason is
            # not visible in this file.
            "corpus_count": 2,
            "document_count": 2
        }
    }
    print(json.dumps(term))