""" module is used to track if there are datasets
belonging to multiple collections.
this identification is done using the graph model files which were
generated during the scraping and transforming processes.
NOTE: this module uses the ED_OUTPUT_PATH environment
varialbe same as the edscrapers toolkit """

import os
import pathlib

import pandas as pd

from edscrapers.scrapers.base.graph import GraphWrapper

OUTPUT_PATH = os.getenv("ED_OUTPUT_PATH")
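
# Expected layout under $ED_OUTPUT_PATH (a sketch inferred from the globbing
# logic below; the office names are illustrative):
#
#   $ED_OUTPUT_PATH/
#       graphs/
#           <office>/
#               <office>.collections.pickle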


def get_all_collection_graph_output_file(dir_path=OUTPUT_PATH) -> list:
    """ function globs through the specified `dir_path` and collects the
    path to each collection graph output file i.e.
    files that end with {name}.collections.pickle """
if str(dir_path) == OUTPUT_PATH:
# get the directory where all graph files are stored
graph_dir_path = pathlib.Path(dir_path, 'graphs')
else:
graph_dir_path = pathlib.Path(dir_path)
    if not graph_dir_path.is_dir():
        raise NotADirectoryError("expected a directory path")
offices_dir_list = [] # holds the directory for every office that has a graph model
# iterate through the graph directory
for dir_child in graph_dir_path.iterdir():
if dir_child.is_dir():
offices_dir_list.append(dir_child)
    collections_graph_files = [] # holds the collections graph file for every available office
# iterate through the list of office directories and get the collection graph file
for office in offices_dir_list:
collections_graph_files.extend(office.glob('*.collections.pickle'))
return collections_graph_files
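
# Illustrative usage (paths are hypothetical; assumes ED_OUTPUT_PATH points at
# an edscrapers output directory):
#
#   >>> get_all_collection_graph_output_file()
#   [PosixPath('/data/ed_output/graphs/ocr/ocr.collections.pickle'), ...]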


def identify_datasets_with_multi_collections(graph_file_path):
    """ function identifies datasets that belong to multiple collections """
if isinstance(graph_file_path, (str, pathlib.Path)):
# load the graph from the filepath provided
GraphWrapper.load_graph(file_dir_path=pathlib.Path(graph_file_path).parent,
file_stem_name=pathlib.Path(graph_file_path).stem)
# get the loaded graph object
graph = GraphWrapper.get_graph()
with graph.graph_lock:
        # select dataset vertices that are linked to more than one collection
        dataset_ver_seq = graph.vs.select(is_dataset_eq=True,
                                          name_ne='base_vertex').\
            select(lambda vertex: 'in_collection' in vertex.attribute_names()
                                  and vertex['in_collection'] is not None
                                  and len(vertex['in_collection']) > 1)
# get the name of the office this graph belongs to
office_name = pathlib.Path(graph_file_path).stem.split('.')[0]
dataset_ver_seq['office_name'] = office_name
        # inform the user that there are datasets with multiple collections
        print(f'There are {len(dataset_ver_seq)} datasets with links to multiple '
              f'Collections within the {office_name.upper()} office')
return dataset_ver_seq
else:
raise TypeError("Invalid 'graph_file_path' specified")
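
# Illustrative usage (the path is hypothetical). The return value is a vertex
# sequence (an igraph VertexSeq in the edscrapers graph model) whose vertices
# carry the 'in_collection' and 'office_name' attributes used by
# output_to_csv() below:
#
#   >>> seq = identify_datasets_with_multi_collections(
#   ...     '/data/ed_output/graphs/ocr/ocr.collections.pickle')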


def output_to_csv(graph_vertex_seq, graph_office_name):
    """ function dumps the identified dataset vertices that
    have multiple collections to a CSV file """
    if len(graph_vertex_seq) == 0: # the sequence is empty, so set some defaults
graph_vertex_seq['collection_urls'] = ''
graph_vertex_seq['collection_names'] = ''
graph_vertex_seq['num_of_collection'] = 0
for vertex in graph_vertex_seq:
        vertex['collection_urls'] = "\n".join([collection['collection_url']
                                               for collection in vertex['in_collection']])
        vertex['collection_names'] = "\n".join([collection['collection_title']
                                                for collection in vertex['in_collection']])
vertex['num_of_collection'] = len(vertex['in_collection'])
    # convert the vertex sequence to a pandas DataFrame
    df = pd.DataFrame(columns=['Dataset URL', 'Dataset Office',
                               'Number of Collections Linked To', 'Collection URLs'])
df['Dataset URL'] = graph_vertex_seq['name']
df['Dataset Office'] = graph_vertex_seq['office_name']
df['Number of Collections Linked To'] = graph_vertex_seq['num_of_collection']
df['Collection URLs'] = graph_vertex_seq['collection_urls']
df.to_csv(pathlib.Path(OUTPUT_PATH, f'{graph_office_name}_dataset_multi_collection.csv'),
columns=['Dataset URL', 'Dataset Office', 'Number of Collections Linked To', 'Collection URLs'],
header=True, index=False)
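
# The resulting CSV holds one row per multi-collection dataset; the
# 'Collection URLs' column packs newline-separated URLs into a single quoted
# field. A hypothetical row:
#
#   Dataset URL,Dataset Office,Number of Collections Linked To,Collection URLs
#   https://www2.ed.gov/example.html,ocr,2,"https://.../c1
#   https://.../c2"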
if __name__ == "__main__":
graph_file_list = get_all_collection_graph_output_file()
# loop through the graph file paths provided
for graph_file in graph_file_list:
name_of_office = pathlib.Path(graph_file).name.split('.')[-3]
v_sequence = identify_datasets_with_multi_collections(graph_file)
output_to_csv(graph_vertex_seq=v_sequence,
graph_office_name=name_of_office)