-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwalkers.py
105 lines (81 loc) · 3.5 KB
/
walkers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from gensim.models import Word2Vec
import csv
import networkx as nx
import random
import pickle
import time
def load_data_from_pickle(pickle_file_path):
    """
    Read a pickled object from disk and return it.

    Note: unpickling can execute arbitrary code embedded in the file, so this
    should only be pointed at pickle files from a trusted source.

    Parameters:
    pickle_file_path (str): Path to the pickle file containing the paper data.

    Returns:
    dict: The dictionary mapping paper IDs to their citations and keywords.
    """
    # Binary mode is required: pickle streams are bytes, not text.
    with open(pickle_file_path, 'rb') as pickle_stream:
        return pickle.load(pickle_stream)
def build_citation_graph(paper_data):
    """
    Turn the paper dictionary into a directed citation graph.

    Every key of paper_data becomes a node carrying a 'keywords' attribute.
    An edge paper -> cited paper is added for each entry in the paper's
    'citations' list, but only when the cited ID is itself a key of
    paper_data; citations pointing outside the dataset are dropped.

    Parameters:
    paper_data (dict): Maps each paper ID to a dict with 'citations' and
        'keywords' entries.

    Returns:
    nx.DiGraph: Graph whose nodes are papers and whose edges are citations.
    """
    graph = nx.DiGraph()  # citations have a direction: citing -> cited

    for paper_id, details in paper_data.items():
        # Register the paper first so it exists even when it cites nothing.
        graph.add_node(paper_id, keywords=details['keywords'])

        # Add this paper's outgoing citations right away (rather than in a
        # second pass) so nodes are inserted in the same order as before.
        graph.add_edges_from(
            (paper_id, cited_id)
            for cited_id in details['citations']
            if cited_id in paper_data
        )

    return graph
def generate_and_save_walks(G, num_walks, walk_length, output_file):
    """
    Run uniform random walks from every node and append them to a CSV file.

    Each walk is written as one semicolon-separated row of node IDs. Nodes
    with no neighbors are skipped as starting points, and a walk stops early
    if it reaches a node with no neighbors. The output file is opened in
    append mode, so rows are added after any content already in the file.

    Parameters:
    G (networkx.Graph): The input graph; only G.nodes() and G.neighbors()
        are used.
    num_walks (int): Number of random walks to start from each node.
    walk_length (int): Maximum number of nodes in each walk.
    output_file (str): Path to the CSV file where walks will be saved.
    """

    def take_walk(start):
        # Step to a uniformly chosen neighbor until the walk is long enough
        # or the current node is a dead end.
        path = [start]
        while len(path) < walk_length:
            candidates = list(G.neighbors(path[-1]))
            if not candidates:
                break
            path.append(random.choice(candidates))
        return path

    start_nodes = list(G.nodes())

    # newline="" lets the csv module control line endings itself.
    with open(output_file, "a", newline="") as csv_file:
        row_writer = csv.writer(csv_file, delimiter=';')
        for start in start_nodes:
            # A node with no neighbors could only yield one-node walks.
            if len(list(G.neighbors(start))) == 0:
                continue
            for _ in range(num_walks):
                row_writer.writerow(take_walk(start))
# Example usage
# generate_and_save_walks(G, num_walks=10, walk_length=30, output_file="random_walks.csv")
if __name__ == '__main__':
    # Script entry point: load the pickled paper data, build the citation
    # graph, write random walks to walks.csv, and report how long each
    # stage took.
    #
    # perf_counter() is a monotonic clock intended for measuring intervals;
    # time.time() follows the wall clock and can jump if the system time is
    # adjusted while the script runs.
    start_time = time.perf_counter()

    # Path to your pickle file
    pickle_file_path = 'D:\\Projects\\3Credit_project\\Aminer Dataset\\paper_data.pkl'

    # Load and build the graph
    print("Starting pickle load")
    paper_data = load_data_from_pickle(pickle_file_path)

    print("starting graph build")
    G = build_citation_graph(paper_data)
    print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

    # One timestamp marks both the end of loading and the start of walking,
    # so the two reported intervals add up to the total.
    pre_write = time.perf_counter()
    print(f"load time = {pre_write - start_time} seconds")

    # Generate and save walks to a CSV file.
    # NOTE: generate_and_save_walks opens its output in append mode, so
    # re-running the script adds rows to an existing walks.csv.
    print("starting to walk")
    generate_and_save_walks(G, num_walks=10, walk_length=10, output_file="walks.csv")
    print("walks generated and saved")
    print(f"time taken = {time.perf_counter() - pre_write} seconds")

    # Delete the graph to free up memory
    del G

    print(f"total time taken = {time.perf_counter() - start_time} seconds")