From 5f5495f90561bbe779d72692d21bf4940bed62db Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 08:47:40 +0200 Subject: [PATCH 1/7] update cora add PuMed and CiteSeer --- Project.toml | 4 +- README.md | 4 +- docs/src/datasets/CiteSeer.md | 11 +++++ docs/src/datasets/Cora.md | 2 +- docs/src/datasets/PubMed.md | 11 +++++ src/CiteSeer/CiteSeer.jl | 79 +++++++++++++++++++++++++++++++ src/Cora/Cora.jl | 88 +++++++++++++++++++++++------------ src/MLDatasets.jl | 20 +++++++- src/PubMed/PubMed.jl | 79 +++++++++++++++++++++++++++++++ src/UD_English/UD_English.jl | 2 +- src/planetoid.jl | 86 ++++++++++++++++++++++++++++++++++ test/runtests.jl | 4 +- test/tst_citeseer.jl | 19 ++++++++ test/tst_cora.jl | 22 +++++---- test/tst_pubmed.jl | 21 +++++++++ 15 files changed, 407 insertions(+), 45 deletions(-) create mode 100644 docs/src/datasets/CiteSeer.md create mode 100644 docs/src/datasets/PubMed.md create mode 100644 src/CiteSeer/CiteSeer.jl create mode 100644 src/PubMed/PubMed.jl create mode 100644 src/planetoid.jl create mode 100644 test/tst_citeseer.jl create mode 100644 test/tst_pubmed.jl diff --git a/Project.toml b/Project.toml index 6795323b..e316d0ba 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "MLDatasets" uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458" -version = "0.5.8" +version = "0.5.9" [deps] BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee" @@ -10,6 +10,7 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" MAT = "23992714-dd62-5051-b70f-ba57cb901cac" +PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" Requires = "ae029012-a4dd-5104-9daa-d747884805df" [compat] @@ -20,6 +21,7 @@ FixedPointNumbers = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8" GZip = "0.5" ImageCore = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8" MAT = "0.7, 0.8, 0.9, 0.10" +PyCall = "1" Requires = "1" julia = "1" diff --git a/README.md b/README.md index f99682e4..bd9b7028 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Each dataset has its own dedicated sub-module. Find below a list of available datasets and links to their documentation. #### Vision - - [CIFAR10](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/) + - [CIFAR10](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR10/) - [CIFAR100](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/) - [EMNIST](https://juliaml.github.io/MLDatasets.jl/latest/datasets/EMNIST/) - [FashionMNIST](https://juliaml.github.io/MLDatasets.jl/latest/datasets/FashionMNIST/) @@ -38,7 +38,9 @@ Find below a list of available datasets and links to their documentation. 
- [UD_English](https://juliaml.github.io/MLDatasets.jl/latest/datasets/UD_English/) #### Graphs + - [CiteSeer](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CiteSeer/) - [Cora](https://juliaml.github.io/MLDatasets.jl/latest/datasets/Cora/) + - [PubMed](https://juliaml.github.io/MLDatasets.jl/latest/datasets/PubMed/) diff --git a/docs/src/datasets/CiteSeer.md b/docs/src/datasets/CiteSeer.md new file mode 100644 index 00000000..76821eb7 --- /dev/null +++ b/docs/src/datasets/CiteSeer.md @@ -0,0 +1,11 @@ +# CiteSeer + +```@docs +CiteSeer +``` + +## API reference + +```@docs +CiteSeer.dataset +``` diff --git a/docs/src/datasets/Cora.md b/docs/src/datasets/Cora.md index ad93a062..c6188874 100644 --- a/docs/src/datasets/Cora.md +++ b/docs/src/datasets/Cora.md @@ -7,5 +7,5 @@ Cora ## API reference ```@docs -Cora.alldata +Cora.dataset ``` diff --git a/docs/src/datasets/PubMed.md b/docs/src/datasets/PubMed.md new file mode 100644 index 00000000..8c042c0b --- /dev/null +++ b/docs/src/datasets/PubMed.md @@ -0,0 +1,11 @@ +# PubMed + +```@docs +PubMed +``` + +## API reference + +```@docs +PubMed.dataset +``` diff --git a/src/CiteSeer/CiteSeer.jl b/src/CiteSeer/CiteSeer.jl new file mode 100644 index 00000000..9c2596f3 --- /dev/null +++ b/src/CiteSeer/CiteSeer.jl @@ -0,0 +1,79 @@ +export CiteSeer + + +""" + CiteSeer + +The CiteSeer citation network dataset from Ref. [1]. +Nodes represent documents and edges represent citation links. +The dataset is designed for the node classification task. +The task is to predict the category of certain paper. +The dataset is retrieved from Ref. [2]. + +## Interface + +- [`CiteSeer.dataset`](@ref) + +## References + +[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815) +[2]: [Planetoid](https://github.com/kimiyoung/planetoid) +""" +module CiteSeer + +using DataDeps +using ..MLDatasets: datafile, read_planetoid_data +using DelimitedFiles: readdlm + +using PyCall + +const DEPNAME = "CiteSeer" +const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data" +const DOCS = "https://github.com/kimiyoung/planetoid" +const DATA = "ind.citeseer." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"] + +function __init__() + register(DataDep( + DEPNAME, + """ + Dataset: The $DEPNAME dataset. + Website: $DOCS + """, + joinpath.(LINK, DATA), + "7f7ec4df97215c573eee316de35754d89382011dfd9fb2b954a4a491057e3eb3", # if checksum omitted, will be generated by DataDeps + # post_fetch_method = unpack + )) +end + +""" + dataset(; dir=nothing, reverse_edges=true) + +Retrieve the CiteSeer dataset. The output is a named tuple with fields +```juliarepl +julia> keys(CiteSeer.dataset()) +(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed) +``` + +In particular, `adjacency_list` is a vector of vector, +where `adjacency_list[i]` will contain the neighbors of node `i` +through outgoing edges. + +If `reverse_edges=true`, the graph will contain +the reverse of each edge and the graph will be undirected. + +See also [`CiteSeer`](@ref). 
+ +## Usage Examples + +```julia +using MLDatasets: CiteSeer +data = CiteSeer.dataset() +train_labels = data.node_labels[data.train_indices] +``` +""" +dataset(; dir=nothing, reverse_edges=true) = + read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges) + + +end #module + diff --git a/src/Cora/Cora.jl b/src/Cora/Cora.jl index f7926788..2a46050e 100644 --- a/src/Cora/Cora.jl +++ b/src/Cora/Cora.jl @@ -1,27 +1,52 @@ export Cora + """ Cora -The full Cora citation network dataset from the -`"Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via -Ranking" `_ paper. +The Cora citation network dataset from Ref. [1]. Nodes represent documents and edges represent citation links. +Each node has a predefined feature with 1433 dimensions. +The dataset is designed for the node classification task. +The task is to predict the category of certain paper. +The dataset is retrieved from Ref. [2]. + +## Statistics + +- Nodes: 2708 +- Edges: 10556 +- Number of Classes: 7 +- Label split: + - Train: 140 + - Val: 500 + - Test: 1000 + +The split is the one used in the original paper [1] and +doesn't consider all nodes. ## Interface -- [`Cora.alldata`](@ref) +- [`Cora.dataset`](@ref) + +## References + +[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815) +[2]: [Planetoid](https://github.com/kimiyoung/planetoid """ module Cora using DataDeps -using ..MLDatasets: datafile +using ..MLDatasets: datafile, read_planetoid_data using DelimitedFiles: readdlm -const DEPNAME = "Cora" -const LINK = "http://nrvis.com/download/data/labeled/cora.zip" -const DOCS = "http://networkrepository.com/cora.php" +using PyCall +const DEPNAME = "Cora" +# LINK = "https://github.com/shchur/gnn-benchmark/raw/master/data/npz" +# LINK = "https://github.com/abojchevski/graph2gauss/raw/master/data/" +const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data" +const DOCS = "https://github.com/kimiyoung/planetoid" +const DATA = "ind.cora." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"] function __init__() register(DataDep( @@ -30,40 +55,41 @@ function __init__() Dataset: The $DEPNAME dataset. Website: $DOCS """, - LINK, - "a3e3a37c34c9385fe8089bbc7c17ef78ecc3bdf8a4b03b80d02aaa080d9501c8", # if checksum omitted, will be generated by DataDeps - post_fetch_method = unpack + joinpath.(LINK, DATA), + "81de017067dc045ebdb8ffd5c0e69a209973ffdb1fe2d5b434e94d3614f3f5c7", # if checksum omitted, will be generated by DataDeps + # post_fetch_method = unpack )) end """ - alldata(; dir=nothing) + dataset(; dir=nothing, reverse_edges=true) Retrieve the Cora dataset. The output is a named tuple with fields +```juliarepl +julia> keys(Cora.dataset()) +(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed) +``` +In particular, `adjacency_list` is a vector of vector, +where `adjacency_list[i]` will contain the neighbors of node `i` +through outgoing edges. -- `edges` -- `node_labels` -- `directed` +If `reverse_edges=true`, the graph will contain +the reverse of each edge and the graph will be undirected. + +See also [`Cora`](@ref). 
## Usage Examples + ```juliarepl -julia> using MLDatasets: Cora +using MLDatasets: Cora -julia> data = Cora.alldata() -(edges = [1 9; 1 436; … ; 2708 1390; 2708 2345], node_labels = [3, 6, 5, 5, 4, 4, 7, 3, 3, 7 … 4, 4, 4, 3, 2, 2, 2, 2, 1, 3], directed = true) +data = Cora.dataset() +train_labels = data.node_labels[data.train_indices] ``` """ -function alldata(; dir=nothing) - edges = readdlm(datafile(DEPNAME, "cora.edges", dir), ',', Int) - @assert all(edges[:,3] .== 1) - edges = edges[:,1:2] - - node_labels = readdlm(datafile(DEPNAME, "cora.node_labels", dir), ',', Int) - node_labels = node_labels[:,2] # first column is just 1:n - - return (; edges=edges, - node_labels=node_labels, - directed=true) -end +dataset(; dir=nothing, reverse_edges=true) = + read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges) + + +end #module -end \ No newline at end of file diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl index 3f43d340..f9b1873e 100644 --- a/src/MLDatasets.jl +++ b/src/MLDatasets.jl @@ -1,7 +1,9 @@ module MLDatasets using Requires +using DelimitedFiles: readdlm using FixedPointNumbers, ColorTypes +using PyCall bytes_to_type(::Type{UInt8}, A::Array{UInt8}) = A bytes_to_type(::Type{N0f8}, A::Array{UInt8}) = reinterpret(N0f8, A) @@ -45,13 +47,29 @@ include("PTBLM/PTBLM.jl") include("UD_English/UD_English.jl") # Graphs -include("Cora/Cora.jl") +include("planetoid.jl") + include("Cora/Cora.jl") + include("PubMed/PubMed.jl") + include("CiteSeer/CiteSeer.jl") function __init__() # initialize optional dependencies @require ImageCore="a09fc81d-aa75-5fe9-8630-4744c3626534" begin global __images_supported__ = true end + + + py""" + import numpy as np + import pickle + + def pyread_planetoid_file(path, name): + out = pickle.load(open(path, "rb"), encoding="latin1") + if name == 'graph': + return out + out = out.todense() if hasattr(out, 'todense') else out + return out + """ end end diff --git a/src/PubMed/PubMed.jl b/src/PubMed/PubMed.jl new file mode 100644 index 00000000..ac29afa7 --- /dev/null +++ b/src/PubMed/PubMed.jl @@ -0,0 +1,79 @@ +export PubMed + + +""" + PubMed + +The PubMed citation network dataset from Ref. [1]. +Nodes represent documents and edges represent citation links. +The dataset is designed for the node classification task. +The task is to predict the category of certain paper. +The dataset is retrieved from Ref. [2]. + +## Interface + +- [`PubMed.dataset`](@ref) + +## References + +[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815) +[2]: [Planetoid](https://github.com/kimiyoung/planetoid) +""" +module PubMed + +using DataDeps +using ..MLDatasets: datafile, read_planetoid_data +using DelimitedFiles: readdlm + +using PyCall + +const DEPNAME = "PubMed" +const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data" +const DOCS = "https://github.com/kimiyoung/planetoid" +const DATA = "ind.pubmed." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"] + +function __init__() + register(DataDep( + DEPNAME, + """ + Dataset: The $DEPNAME dataset. + Website: $DOCS + """, + joinpath.(LINK, DATA), + "0b8bf8e80564611b540655e9cbb8c5900dd3728d4ababe0b990b6f27144bd76c", # if checksum omitted, will be generated by DataDeps + # post_fetch_method = unpack + )) +end + +""" + dataset(; dir=nothing) + +Retrieve the PubMed dataset. 
The output is a named tuple with fields +```juliarepl +julia> keys(PubMed.dataset()) +(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed) +``` + +In particular, `adjacency_list` is a vector of vector, +where `adjacency_list[i]` will contain the neighbors of node `i` +through outgoing edges. + +If `reverse_edges=true`, the graph will contain +the reverse of each edge and the graph will be undirected. + +See also [`PubMed`](@ref). + +## Usage Examples + +```julia +using MLDatasets: PubMed +data = PubMed.dataset() +train_labels = data.node_labels[data.train_indices] +``` +""" +dataset(; dir=nothing, reverse_edges=true) = + read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges) + + +end #module + diff --git a/src/UD_English/UD_English.jl b/src/UD_English/UD_English.jl index f23a46cc..34de3646 100644 --- a/src/UD_English/UD_English.jl +++ b/src/UD_English/UD_English.jl @@ -70,7 +70,7 @@ module UD_English detail on the Website. """, "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/" .* [TRAINFILE, DEVFILE, TESTFILE], - "f2ccb6da7ec5fb0b617f0e7d7a13f3e292621eadbc324a1b3e7479d50a290177" + "e08d57e95264ac97ca861261e3119e093c054453c5dfc583e2402459504d93b7" )) end end diff --git a/src/planetoid.jl b/src/planetoid.jl new file mode 100644 index 00000000..d6fd8824 --- /dev/null +++ b/src/planetoid.jl @@ -0,0 +1,86 @@ +""" +Read any of the citation network datasets “Cora”, “CiteSeer” and “PubMed” +from the “Revisiting Semi-Supervised Learning with Graph Embeddings” paper. +Nodes represent documents and edges represent citation links. + +Data collected from +https://github.com/kimiyoung/planetoid/raw/master/data +""" +function read_planetoid_data(DEPNAME; dir=nothing, reverse_edges=true) + name = lowercase(DEPNAME) + + x = read_planetoid_file(DEPNAME, "ind.$(name).x", dir) + y = read_planetoid_file(DEPNAME, "ind.$(name).y", dir) + allx = read_planetoid_file(DEPNAME, "ind.$(name).allx", dir) + ally = read_planetoid_file(DEPNAME, "ind.$(name).ally", dir) + tx = read_planetoid_file(DEPNAME, "ind.$(name).tx", dir) + ty = read_planetoid_file(DEPNAME, "ind.$(name).ty", dir) + graph = read_planetoid_file(DEPNAME, "ind.$(name).graph", dir) + test_index = read_planetoid_file(DEPNAME, "ind.$(name).test.index", dir) + + ntrain = size(x, 2) + train_index = 1:ntrain + val_index = ntrain+1:ntrain+500 + sorted_test_index = sort(test_index) + + if name == "citeseer" + # There are some isolated nodes in the Citeseer graph, resulting in + # none consecutive test indices. We need to identify them and add them + # as zero vectors to `tx` and `ty`. 
+ len_test_indices = (maximum(test_index) - minimum(test_index)) + 1 + + tx_ext = zeros(size(tx,1), len_test_indices) + tx_ext[:, sorted_test_index .- minimum(test_index) .+ 1] .= tx + ty_ext = zeros(len_test_indices) + ty_ext[sorted_test_index .- minimum(test_index) .+ 1] = ty + + tx, ty = tx_ext, ty_ext + end + x = hcat(allx, tx) + y = vcat(ally, ty) + x[:, test_index] = x[:, sorted_test_index] + y[test_index] = y[sorted_test_index] + test_index = size(allx,2)+1:size(x,2) + + num_nodes = size(x, 2) + adj_list = [Int[] for i=1:num_nodes] + for (i, neigs) in pairs(graph) # graph is dictionay representing the adjacency list + neigs = unique(neigs) # remove duplicated edges + neigs = filter(x -> x!=i, neigs)# remove self-loops + append!(adj_list[i+1], neigs .+ 1) # convert to 1-indexed + end + if reverse_edges + for (i, neigs) in enumerate(adj_list) + for j in neigs + i ∉ adj_list[j] && push!(adj_list[j], i) + end + end + end + + return (; node_features = x, + node_labels = y, + adjacency_list = adj_list, + train_indices = train_index, + val_indices = val_index, + test_indices = test_index, + num_classes = length(unique(y)), + num_nodes = num_nodes, + num_edges = sum(length.(adj_list)), + directed = reverse_edges != true) +end + +function read_planetoid_file(DEPNAME, name, dir) + filename = datafile(DEPNAME, name, dir) + if endswith(name, "test.index") + out = 1 .+ vec(readdlm(filename, Int)) + else + out = py"pyread_planetoid_file"(filename, name) + if out isa Matrix + out = collect(out') + end + end + if endswith(name, "y") + out = map(y->y[1], argmax(out, dims=1)) |> vec + end + return out +end diff --git a/test/runtests.jl b/test/runtests.jl index 8d46708b..bd0156ae 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,9 @@ tests = [ "tst_fashion_mnist.jl", "tst_svhn2.jl", "tst_emnist.jl", - "tst_cora.jl" + "tst_cora.jl", + "tst_citeseer.jl", + "tst_pubmed.jl", ] for t in tests diff --git a/test/tst_citeseer.jl b/test/tst_citeseer.jl new file mode 100644 index 00000000..74e70581 --- /dev/null +++ b/test/tst_citeseer.jl @@ -0,0 +1,19 @@ +data_dir = withenv("DATADEPS_ALWAY_ACCEPT"=>"true") do + datadep"CiteSeer" +end + +@testset "CiteSeer" begin + data = CiteSeer.dataset() + + @test data.num_nodes == 3327 + @test data.num_edges == 9104 + @test size(data.node_features) == (3703, data.num_nodes) + @test size(data.node_labels) == (data.num_nodes,) + @test size(data.train_indices) == (120,) + @test size(data.val_indices) == (500,) + @test size(data.test_indices) == (1015,) + @test size(data.adjacency_list) == (data.num_nodes, ) + @test sum(length.(data.adjacency_list)) == (data.num_edges) + @test minimum(minimum.(data.adjacency_list; init=1000)) == 1 + @test maximum(maximum.(data.adjacency_list; init=1000)) == data.num_nodes +end diff --git a/test/tst_cora.jl b/test/tst_cora.jl index ed528135..5f92fd02 100644 --- a/test/tst_cora.jl +++ b/test/tst_cora.jl @@ -3,12 +3,18 @@ data_dir = withenv("DATADEPS_ALWAY_ACCEPT"=>"true") do end @testset "Cora" begin - data = Cora.alldata() - @test data isa NamedTuple - - @test data.edges isa Matrix{Int} - @test size(data.edges) == (5429, 2) - @test data.node_labels isa Vector{Int} - @test size(data.node_labels) == (2708,) - @test data.directed + data = Cora.dataset() + + @test data.num_nodes == 2708 + @test data.num_edges == 10556 + @test data.directed == true + @test size(data.node_features) == (1433, data.num_nodes) + @test size(data.node_labels) == (data.num_nodes,) + @test size(data.train_indices) == (140,) + @test size(data.val_indices) == 
(500,) + @test size(data.test_indices) == (1000,) + @test size(data.adjacency_list) == (data.num_nodes, ) + @test sum(length.(data.adjacency_list)) == (data.num_edges) + @test minimum(minimum.(data.adjacency_list)) == 1 + @test maximum(maximum.(data.adjacency_list)) == data.num_nodes end diff --git a/test/tst_pubmed.jl b/test/tst_pubmed.jl new file mode 100644 index 00000000..6e223649 --- /dev/null +++ b/test/tst_pubmed.jl @@ -0,0 +1,21 @@ +data_dir = withenv("DATADEPS_ALWAY_ACCEPT"=>"true") do + datadep"PubMed" +end + +@testset "PubMed" begin + data = PubMed.dataset() + + @test data.num_nodes == 19717 + @test data.num_edges == 88648 + @test data.directed == true + @test size(data.node_features) == (500, data.num_nodes) + @test size(data.node_labels) == (data.num_nodes,) + @test size(data.train_indices) == (60,) + @test size(data.val_indices) == (500,) + @test size(data.test_indices) == (1000,) + @test size(data.adjacency_list) == (data.num_nodes, ) + @test sum(length.(data.adjacency_list)) == (data.num_edges) + @test minimum(minimum.(data.adjacency_list)) == 1 + @test maximum(maximum.(data.adjacency_list)) == data.num_nodes + +end From fbfa33784cb24ec580e94ce301c3a77123d4bb17 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:19:54 +0200 Subject: [PATCH 2/7] don't use system Python --- .github/workflows/UnitTest.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml index eaa07e75..1f26a784 100644 --- a/.github/workflows/UnitTest.yml +++ b/.github/workflows/UnitTest.yml @@ -18,7 +18,8 @@ jobs: matrix: julia-version: ['1.0', '1', 'nightly'] os: [ubuntu-latest, windows-latest, macOS-latest] - + env: + PYTHON: "" steps: - uses: actions/checkout@v1.0.0 - name: "Set up Julia" From 49a3415b53992b7d251f2d074c8e25d9758cee29 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:33:36 +0200 Subject: [PATCH 3/7] pyimport_conda --- src/MLDatasets.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl index f9b1873e..7a59410a 100644 --- a/src/MLDatasets.jl +++ b/src/MLDatasets.jl @@ -58,9 +58,10 @@ function __init__() global __images_supported__ = true end + # install scipy if not already there + pyimport_conda("scipy", "scipy") py""" - import numpy as np import pickle def pyread_planetoid_file(path, name): From b40159908f0d944f5b057d57598852a9eb57fa8a Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:35:25 +0200 Subject: [PATCH 4/7] fix documenter --- .github/workflows/Documenter.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml index 73ba2ea1..c961e9b3 100644 --- a/.github/workflows/Documenter.yml +++ b/.github/workflows/Documenter.yml @@ -10,6 +10,8 @@ on: jobs: build: runs-on: ubuntu-latest + env: + PYTHON: "" steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest From cafc860d690ff950a1a427562ed9b036b2aa83f6 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:47:11 +0200 Subject: [PATCH 5/7] fixes --- docs/make.jl | 4 ++++ docs/src/utils.md | 4 ++++ test/tst_citeseer.jl | 1 + test/tst_cora.jl | 2 +- test/tst_pubmed.jl | 2 +- 5 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 docs/src/utils.md diff --git a/docs/make.jl b/docs/make.jl index c6e33b08..0cb021dd 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -19,6 +19,7 @@ makedocs( ), authors = "Hiroyuki Shindo, Christof 
Stocker", + # TODO: automatize `pages` creation pages = Any[ "Home" => "index.md", "Available Datasets" => Any[ @@ -40,10 +41,13 @@ makedocs( ], "Graphs" => Any[ + "CiteSeer" => "datasets/CiteSeer.md", "Cora" => "datasets/Cora.md", + "PubMed" => "datasets/PubMed.md", ], ], + "Utis" -> "utils.md", "LICENSE.md", ], strict = true diff --git a/docs/src/utils.md b/docs/src/utils.md new file mode 100644 index 00000000..a232e161 --- /dev/null +++ b/docs/src/utils.md @@ -0,0 +1,4 @@ +# Utils +```@docs +MLDatasets.read_planetoid_data +``` \ No newline at end of file diff --git a/test/tst_citeseer.jl b/test/tst_citeseer.jl index 74e70581..3633f9b7 100644 --- a/test/tst_citeseer.jl +++ b/test/tst_citeseer.jl @@ -7,6 +7,7 @@ end @test data.num_nodes == 3327 @test data.num_edges == 9104 + @test data.directed == false @test size(data.node_features) == (3703, data.num_nodes) @test size(data.node_labels) == (data.num_nodes,) @test size(data.train_indices) == (120,) diff --git a/test/tst_cora.jl b/test/tst_cora.jl index 5f92fd02..8e5e8b0c 100644 --- a/test/tst_cora.jl +++ b/test/tst_cora.jl @@ -7,7 +7,7 @@ end @test data.num_nodes == 2708 @test data.num_edges == 10556 - @test data.directed == true + @test data.directed == false @test size(data.node_features) == (1433, data.num_nodes) @test size(data.node_labels) == (data.num_nodes,) @test size(data.train_indices) == (140,) diff --git a/test/tst_pubmed.jl b/test/tst_pubmed.jl index 6e223649..30603b4a 100644 --- a/test/tst_pubmed.jl +++ b/test/tst_pubmed.jl @@ -7,7 +7,7 @@ end @test data.num_nodes == 19717 @test data.num_edges == 88648 - @test data.directed == true + @test data.directed == false @test size(data.node_features) == (500, data.num_nodes) @test size(data.node_labels) == (data.num_nodes,) @test size(data.train_indices) == (60,) From 2c5d64c1a08cf7110772a49700966fc3a1079eec Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:52:35 +0200 Subject: [PATCH 6/7] doc fix --- docs/make.jl | 2 +- docs/src/utils.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 0cb021dd..71aa5542 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -47,7 +47,7 @@ makedocs( ], ], - "Utis" -> "utils.md", + "Utils" => "utils.md", "LICENSE.md", ], strict = true diff --git a/docs/src/utils.md b/docs/src/utils.md index a232e161..ecd2791a 100644 --- a/docs/src/utils.md +++ b/docs/src/utils.md @@ -1,4 +1,5 @@ # Utils + ```@docs MLDatasets.read_planetoid_data ``` \ No newline at end of file From d40696a8a120285272ae03660b5af8b4122d3549 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 16:24:27 +0200 Subject: [PATCH 7/7] fixes --- src/CiteSeer/CiteSeer.jl | 2 +- src/Cora/Cora.jl | 2 +- src/PubMed/PubMed.jl | 2 +- test/tst_citeseer.jl | 6 ++++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/CiteSeer/CiteSeer.jl b/src/CiteSeer/CiteSeer.jl index 9c2596f3..24d35f01 100644 --- a/src/CiteSeer/CiteSeer.jl +++ b/src/CiteSeer/CiteSeer.jl @@ -39,7 +39,7 @@ function __init__() Dataset: The $DEPNAME dataset. Website: $DOCS """, - joinpath.(LINK, DATA), + map(x -> "$LINK/$x", DATA), "7f7ec4df97215c573eee316de35754d89382011dfd9fb2b954a4a491057e3eb3", # if checksum omitted, will be generated by DataDeps # post_fetch_method = unpack )) diff --git a/src/Cora/Cora.jl b/src/Cora/Cora.jl index 2a46050e..01faad31 100644 --- a/src/Cora/Cora.jl +++ b/src/Cora/Cora.jl @@ -55,7 +55,7 @@ function __init__() Dataset: The $DEPNAME dataset. 
Website: $DOCS """, - joinpath.(LINK, DATA), + map(x -> "$LINK/$x", DATA), "81de017067dc045ebdb8ffd5c0e69a209973ffdb1fe2d5b434e94d3614f3f5c7", # if checksum omitted, will be generated by DataDeps # post_fetch_method = unpack )) diff --git a/src/PubMed/PubMed.jl b/src/PubMed/PubMed.jl index ac29afa7..ea9ae482 100644 --- a/src/PubMed/PubMed.jl +++ b/src/PubMed/PubMed.jl @@ -39,7 +39,7 @@ function __init__() Dataset: The $DEPNAME dataset. Website: $DOCS """, - joinpath.(LINK, DATA), + map(x -> "$LINK/$x", DATA), "0b8bf8e80564611b540655e9cbb8c5900dd3728d4ababe0b990b6f27144bd76c", # if checksum omitted, will be generated by DataDeps # post_fetch_method = unpack )) diff --git a/test/tst_citeseer.jl b/test/tst_citeseer.jl index 3633f9b7..6a6edc00 100644 --- a/test/tst_citeseer.jl +++ b/test/tst_citeseer.jl @@ -15,6 +15,8 @@ end @test size(data.test_indices) == (1015,) @test size(data.adjacency_list) == (data.num_nodes, ) @test sum(length.(data.adjacency_list)) == (data.num_edges) - @test minimum(minimum.(data.adjacency_list; init=1000)) == 1 - @test maximum(maximum.(data.adjacency_list; init=1000)) == data.num_nodes + if VERSION >= v"1.6.0" + @test minimum(minimum.(data.adjacency_list; init=1000)) == 1 + @test maximum(maximum.(data.adjacency_list; init=1000)) == data.num_nodes + end end
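
For downstream users of this patch series, here is a minimal usage sketch of the Planetoid-style loaders it introduces. It is illustrative only: the `DATADEPS_ALWAYS_ACCEPT` environment variable is the standard DataDeps.jl auto-accept switch (assumed to be appropriate here), and the edge-list conversion is not part of the PR, just one way to consume the documented `adjacency_list` field.

```julia
# Usage sketch (not part of the patch): load one of the new citation datasets
# and flatten its adjacency list into an edge matrix.
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"   # skip the interactive download prompt

using MLDatasets: Cora, CiteSeer, PubMed

data = Cora.dataset()                    # named tuple documented in Cora.dataset

X = data.node_features                   # 1433 × num_nodes feature matrix
y = data.node_labels                     # integer labels in 1:num_classes

# `adjacency_list[i]` holds the 1-indexed neighbors of node i through outgoing edges.
edges = reduce(hcat, [[i, j] for (i, neigs) in enumerate(data.adjacency_list) for j in neigs])
@assert size(edges, 2) == data.num_edges

# Semi-supervised split from the original Planetoid paper.
train_X = X[:, data.train_indices]
train_y = y[data.train_indices]
```

The same pattern applies to `CiteSeer.dataset()` and `PubMed.dataset()`, since all three loaders delegate to `read_planetoid_data` and return a named tuple with the same fields.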