-
Notifications
You must be signed in to change notification settings - Fork 0
/
wikidata_connector.py
71 lines (65 loc) · 3.03 KB
/
wikidata_connector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pickle
from pathlib import Path
import requests
import unidecode as unidecode
class WikidataConnector:
    """Fetch lists of entity labels from the Wikidata SPARQL endpoint.

    Results are cached twice: in memory (``self.results``) and on disk as
    pickle files named ``wikidata_<query><year2>.txt`` in the current working
    directory.
    """

    def __init__(self):
        # SPARQL text keyed by query name. Only "films" is a ``str.format``
        # template (doubled braces, ``{0}``/``{1}`` year placeholders); the
        # other entries are sent verbatim.
        self.queries = {
            "actors": """SELECT DISTINCT ?actorLabel WHERE {
?actor wdt:P31 wd:Q5;
wdt:P106 wd:Q10800557;
wdt:P21 wd:Q6581097.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
} ORDER BY ?actorLabel""",
            "actresses": """SELECT DISTINCT ?actorLabel WHERE {
?actor wdt:P31 wd:Q5;
wdt:P106 wd:Q10800557;
wdt:P21 wd:Q6581072.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
} ORDER BY ?actorLabel""",
            "films": """SELECT DISTINCT ?filmLabel WHERE {{
?film wdt:P31 wd:Q11424;
wdt:P345 ?id;
wdt:P577 ?date.
FILTER (?date > "{0}-01-01"^^xsd:dateTime && ?date < "{1}-12-31"^^xsd:dateTime)
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
}} ORDER BY ?filmLabel""",
            "directors": """SELECT DISTINCT ?directorLabel WHERE {
?director wdt:P31 wd:Q5;
wdt:P106 wd:Q2526255.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
} ORDER BY ?directorLabel""",
            "series": """SELECT DISTINCT ?seriesLabel WHERE {
?series wdt:P31 wd:Q5398426;
wdt:P345 ?id;
wdt:P580 ?date.
FILTER (?date > "2000-01-01"^^xsd:dateTime && ?date < NOW())
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
} ORDER BY ?seriesLabel""",
        }
        # In-memory cache: "<query><year2>" -> list of parsed names.
        self.results = {}

    def call_wikidate(self, query, field_name, year1="", year2=""):
        """Return the parsed name list for ``query``, using caches when possible.

        Lookup order: in-memory cache, then the pickle file on disk, then a
        live request to the SPARQL endpoint (whose result is stored in both
        caches).

        Args:
            query: Key into ``self.queries``.
            field_name: Name of the SPARQL result variable to extract
                (e.g. ``"actorLabel"``).
            year1: Value substituted for ``{0}`` in the query template.
            year2: Value substituted for ``{1}`` in the query template. The
                template is only formatted when this is not ``""``, so the
                "films" query needs both years.

        Returns:
            The list produced by ``parse_json`` (or loaded from cache).

        Raises:
            KeyError: If ``query`` is unknown and nothing is cached for it.
            requests.HTTPError: If the endpoint answers with an error status.

        Note:
            The cache key is ``query`` + ``year2``; ``year1`` is not part of
            it, so calls that differ only in ``year1`` share one cache entry.
        """
        # f-string so that years passed as ints work as well as strings.
        cache_key = f"{query}{year2}"
        if cache_key not in self.results:
            # The same file name is used for the existence check, the read and
            # the write; checking a name without the year suffix would miss
            # (or wrongly match) year-specific cache files.
            cache_file = Path("wikidata_" + cache_key + ".txt")
            if cache_file.exists():
                # NOTE(security): pickle.load executes arbitrary code from the
                # file; only keep cache files written by this class here.
                with open(cache_file, 'rb') as f:
                    self.results[cache_key] = pickle.load(f)
            else:
                url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
                sparql = self.queries[query]
                if year2 != "":
                    sparql = sparql.format(year1, year2)
                response = requests.get(url, params={'query': sparql, 'format': 'json'})
                # Surface HTTP errors directly instead of failing later while
                # decoding a non-JSON error page.
                response.raise_for_status()
                self.results[cache_key] = self.parse_json(response.json(), field_name)
                with open(cache_file, 'wb') as f:
                    pickle.dump(self.results[cache_key], f)
        return self.results[cache_key]

    def parse_json(self, json, field_name):
        """Extract and normalise the ``field_name`` values of a SPARQL response.

        Args:
            json: Decoded response; reads ``json['results']['bindings']``.
            field_name: Binding whose ``'value'`` is taken from each row.

        Returns:
            Names with ``'-'`` replaced by a space, ``' of '`` collapsed to a
            single space and passed through ``unidecode``, sorted by length,
            longest first.
        """
        entities = []
        for item in json['results']['bindings']:
            name = item[field_name]['value'].replace('-', ' ').replace(' of ', ' ')
            entities.append(unidecode.unidecode(name))
        entities.sort(key=len, reverse=True)
        return entities