# app.py
import csv
import json
from os.path import join
import time

import requests

# Base URL for the IATI Registry's CKAN API.
API_BASE_URL = "https://iatiregistry.org/api/action/"
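
# Not in the original script: the functions below write into an "out"
# directory and would fail with FileNotFoundError if it were missing. This
# guard (an addition, assuming the directory is not guaranteed to exist)
# makes the script self-contained; it is harmless if "out" already exists.
import os

os.makedirs("out", exist_ok=True)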

def fetch(path, *args, **kwargs):
    """GET a Registry API action and return its JSON "result", retrying on failure."""
    def _fetch(*args, **kwargs):
        # Simple request logging, then a short pause to be polite to the API.
        print(args, kwargs)
        r = requests.get(*args, **kwargs)
        time.sleep(0.1)
        r.raise_for_status()
        return r

    attempts = 5
    while True:
        try:
            r = _fetch(API_BASE_URL + path, *args, **kwargs)
            break
        except Exception:
            # Back off and retry, up to five attempts in total; a bare
            # "raise" preserves the original traceback on the final failure.
            print("Retrying in 5 seconds")
            time.sleep(5)
            attempts -= 1
            if attempts == 0:
                raise
    return r.json()["result"]
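
# A sketch of how the retry wrapper is used (keyword arguments are forwarded
# straight to requests.get, so CKAN query parameters go in via params=...):
#
#     first = fetch("package_search", params={"start": 0, "rows": 1})
#     print(first["count"])  # "count" is assumed from CKAN's package_search schema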

def fetch_publishers():
    """Fetch every publisher's metadata and dump it to out/publisher_list.json."""
    ids = fetch("organization_list")
    output = {
        "help": "https://registry.codeforiati.org",
        "success": True,
        "result": [],
    }
    for id_ in ids:
        # One API call per publisher, including its historical names.
        data = fetch(
            "organization_show",
            params={
                "id": id_,
                "show_historical_publisher_names": "true",
            })
        output["result"].append(data)
    with open(join("out", "publisher_list.json"), "w") as fp:
        json.dump(output, fp)
    return output["result"]
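
# Each entry appended above is an "organization_show" record. Based only on
# the fields this script reads, a record looks at least like this
# (illustrative values, other keys omitted):
#
#     {
#         "name": "example-publisher",
#         "historical_publisher_names": [{"old_name": "example-publisher-old"}],
#     }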

def generate_mappings(publishers):
    """Write a CSV mapping each current registry ID to its previous IDs."""
    mappings = {
        # De-duplicate old names, since the history may repeat a name.
        x["name"]: list(set(y["old_name"] for y in x["historical_publisher_names"]))
        for x in publishers
        if x["historical_publisher_names"]
    }
    # newline="" per the csv module docs, so rows are not double-spaced on Windows.
    with open(join("out", "registry_id_relationships.csv"), "w", newline="") as fh:
        writer = csv.DictWriter(
            fh, fieldnames=["current_registry_id", "previous_registry_id"])
        writer.writeheader()
        for current_name, old_names in mappings.items():
            for old_name in old_names:
                writer.writerow({
                    "current_registry_id": current_name,
                    "previous_registry_id": old_name,
                })
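
# The resulting CSV contains one row per (current, previous) name pair
# (illustrative values, not real registry IDs):
#
#     current_registry_id,previous_registry_id
#     example-publisher,example-publisher-old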

def fetch_datasets():
    """Page through package_search and dump all datasets to out/dataset_list.json."""
    page = 1
    page_size = 1000
    output = {
        "help": "https://registry.codeforiati.org",
        "success": True,
        "result": [],
    }
    while True:
        start = page_size * (page - 1)
        # fetch() already unwraps the action's "result"; package_search nests
        # the datasets one level further down, under "results".
        data = fetch(
            "package_search",
            params={"start": start, "rows": page_size})["results"]
        if data == []:
            # An empty page means every dataset has been collected.
            break
        output["result"] += data
        page += 1
    with open(join("out", "dataset_list.json"), "w") as fp:
        json.dump(output, fp)
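
# Pagination sketch: with page_size = 1000 the loop walks the collection in
# offset steps until a page comes back empty:
#
#     start = page_size * (page - 1)   # page 1 -> 0, page 2 -> 1000, ...
#
# CKAN deployments commonly cap "rows" per request (1000 is a typical limit),
# which is assumed to be why the script pages rather than requesting
# everything in a single call.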

if __name__ == "__main__":
    # Only run the pipeline when executed as a script, not on import.
    publishers = fetch_publishers()
    generate_mappings(publishers)
    fetch_datasets()
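
# Running this file (e.g. "python app.py") writes three artifacts into out/,
# all named in the code above: publisher_list.json,
# registry_id_relationships.csv and dataset_list.json.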