From 5f5495f90561bbe779d72692d21bf4940bed62db Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 08:47:40 +0200 Subject: [PATCH 1/7] update cora add PuMed and CiteSeer --- Project.toml | 4 +- README.md | 4 +- docs/src/datasets/CiteSeer.md | 11 +++++ docs/src/datasets/Cora.md | 2 +- docs/src/datasets/PubMed.md | 11 +++++ src/CiteSeer/CiteSeer.jl | 79 +++++++++++++++++++++++++++++++ src/Cora/Cora.jl | 88 +++++++++++++++++++++++------------ src/MLDatasets.jl | 20 +++++++- src/PubMed/PubMed.jl | 79 +++++++++++++++++++++++++++++++ src/UD_English/UD_English.jl | 2 +- src/planetoid.jl | 86 ++++++++++++++++++++++++++++++++++ test/runtests.jl | 4 +- test/tst_citeseer.jl | 19 ++++++++ test/tst_cora.jl | 22 +++++---- test/tst_pubmed.jl | 21 +++++++++ 15 files changed, 407 insertions(+), 45 deletions(-) create mode 100644 docs/src/datasets/CiteSeer.md create mode 100644 docs/src/datasets/PubMed.md create mode 100644 src/CiteSeer/CiteSeer.jl create mode 100644 src/PubMed/PubMed.jl create mode 100644 src/planetoid.jl create mode 100644 test/tst_citeseer.jl create mode 100644 test/tst_pubmed.jl diff --git a/Project.toml b/Project.toml index 6795323b..e316d0ba 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "MLDatasets" uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458" -version = "0.5.8" +version = "0.5.9" [deps] BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee" @@ -10,6 +10,7 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" MAT = "23992714-dd62-5051-b70f-ba57cb901cac" +PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" Requires = "ae029012-a4dd-5104-9daa-d747884805df" [compat] @@ -20,6 +21,7 @@ FixedPointNumbers = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8" GZip = "0.5" ImageCore = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8" MAT = "0.7, 0.8, 0.9, 0.10" +PyCall = "1" Requires = "1" julia = "1" diff --git a/README.md b/README.md index f99682e4..bd9b7028 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Each dataset has its own dedicated sub-module. Find below a list of available datasets and links to their documentation. #### Vision - - [CIFAR10](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/) + - [CIFAR10](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR10/) - [CIFAR100](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/) - [EMNIST](https://juliaml.github.io/MLDatasets.jl/latest/datasets/EMNIST/) - [FashionMNIST](https://juliaml.github.io/MLDatasets.jl/latest/datasets/FashionMNIST/) @@ -38,7 +38,9 @@ Find below a list of available datasets and links to their documentation. 
- [UD_English](https://juliaml.github.io/MLDatasets.jl/latest/datasets/UD_English/) #### Graphs + - [CiteSeer](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CiteSeer/) - [Cora](https://juliaml.github.io/MLDatasets.jl/latest/datasets/Cora/) + - [PubMed](https://juliaml.github.io/MLDatasets.jl/latest/datasets/PubMed/) diff --git a/docs/src/datasets/CiteSeer.md b/docs/src/datasets/CiteSeer.md new file mode 100644 index 00000000..76821eb7 --- /dev/null +++ b/docs/src/datasets/CiteSeer.md @@ -0,0 +1,11 @@ +# CiteSeer + +```@docs +CiteSeer +``` + +## API reference + +```@docs +CiteSeer.dataset +``` diff --git a/docs/src/datasets/Cora.md b/docs/src/datasets/Cora.md index ad93a062..c6188874 100644 --- a/docs/src/datasets/Cora.md +++ b/docs/src/datasets/Cora.md @@ -7,5 +7,5 @@ Cora ## API reference ```@docs -Cora.alldata +Cora.dataset ``` diff --git a/docs/src/datasets/PubMed.md b/docs/src/datasets/PubMed.md new file mode 100644 index 00000000..8c042c0b --- /dev/null +++ b/docs/src/datasets/PubMed.md @@ -0,0 +1,11 @@ +# PubMed + +```@docs +PubMed +``` + +## API reference + +```@docs +PubMed.dataset +``` diff --git a/src/CiteSeer/CiteSeer.jl b/src/CiteSeer/CiteSeer.jl new file mode 100644 index 00000000..9c2596f3 --- /dev/null +++ b/src/CiteSeer/CiteSeer.jl @@ -0,0 +1,79 @@ +export CiteSeer + + +""" + CiteSeer + +The CiteSeer citation network dataset from Ref. [1]. +Nodes represent documents and edges represent citation links. +The dataset is designed for the node classification task. +The task is to predict the category of certain paper. +The dataset is retrieved from Ref. [2]. + +## Interface + +- [`CiteSeer.dataset`](@ref) + +## References + +[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815) +[2]: [Planetoid](https://github.com/kimiyoung/planetoid) +""" +module CiteSeer + +using DataDeps +using ..MLDatasets: datafile, read_planetoid_data +using DelimitedFiles: readdlm + +using PyCall + +const DEPNAME = "CiteSeer" +const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data" +const DOCS = "https://github.com/kimiyoung/planetoid" +const DATA = "ind.citeseer." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"] + +function __init__() + register(DataDep( + DEPNAME, + """ + Dataset: The $DEPNAME dataset. + Website: $DOCS + """, + joinpath.(LINK, DATA), + "7f7ec4df97215c573eee316de35754d89382011dfd9fb2b954a4a491057e3eb3", # if checksum omitted, will be generated by DataDeps + # post_fetch_method = unpack + )) +end + +""" + dataset(; dir=nothing, reverse_edges=true) + +Retrieve the CiteSeer dataset. The output is a named tuple with fields +```juliarepl +julia> keys(CiteSeer.dataset()) +(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed) +``` + +In particular, `adjacency_list` is a vector of vector, +where `adjacency_list[i]` will contain the neighbors of node `i` +through outgoing edges. + +If `reverse_edges=true`, the graph will contain +the reverse of each edge and the graph will be undirected. + +See also [`CiteSeer`](@ref). 
+ +## Usage Examples + +```julia +using MLDatasets: CiteSeer +data = CiteSeer.dataset() +train_labels = data.node_labels[data.train_indices] +``` +""" +dataset(; dir=nothing, reverse_edges=true) = + read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges) + + +end #module + diff --git a/src/Cora/Cora.jl b/src/Cora/Cora.jl index f7926788..2a46050e 100644 --- a/src/Cora/Cora.jl +++ b/src/Cora/Cora.jl @@ -1,27 +1,52 @@ export Cora + """ Cora -The full Cora citation network dataset from the -`"Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via -Ranking" `_ paper. +The Cora citation network dataset from Ref. [1]. Nodes represent documents and edges represent citation links. +Each node has a predefined feature with 1433 dimensions. +The dataset is designed for the node classification task. +The task is to predict the category of certain paper. +The dataset is retrieved from Ref. [2]. + +## Statistics + +- Nodes: 2708 +- Edges: 10556 +- Number of Classes: 7 +- Label split: + - Train: 140 + - Val: 500 + - Test: 1000 + +The split is the one used in the original paper [1] and +doesn't consider all nodes. ## Interface -- [`Cora.alldata`](@ref) +- [`Cora.dataset`](@ref) + +## References + +[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815) +[2]: [Planetoid](https://github.com/kimiyoung/planetoid """ module Cora using DataDeps -using ..MLDatasets: datafile +using ..MLDatasets: datafile, read_planetoid_data using DelimitedFiles: readdlm -const DEPNAME = "Cora" -const LINK = "http://nrvis.com/download/data/labeled/cora.zip" -const DOCS = "http://networkrepository.com/cora.php" +using PyCall +const DEPNAME = "Cora" +# LINK = "https://github.com/shchur/gnn-benchmark/raw/master/data/npz" +# LINK = "https://github.com/abojchevski/graph2gauss/raw/master/data/" +const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data" +const DOCS = "https://github.com/kimiyoung/planetoid" +const DATA = "ind.cora." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"] function __init__() register(DataDep( @@ -30,40 +55,41 @@ function __init__() Dataset: The $DEPNAME dataset. Website: $DOCS """, - LINK, - "a3e3a37c34c9385fe8089bbc7c17ef78ecc3bdf8a4b03b80d02aaa080d9501c8", # if checksum omitted, will be generated by DataDeps - post_fetch_method = unpack + joinpath.(LINK, DATA), + "81de017067dc045ebdb8ffd5c0e69a209973ffdb1fe2d5b434e94d3614f3f5c7", # if checksum omitted, will be generated by DataDeps + # post_fetch_method = unpack )) end """ - alldata(; dir=nothing) + dataset(; dir=nothing, reverse_edges=true) Retrieve the Cora dataset. The output is a named tuple with fields +```juliarepl +julia> keys(Cora.dataset()) +(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed) +``` +In particular, `adjacency_list` is a vector of vector, +where `adjacency_list[i]` will contain the neighbors of node `i` +through outgoing edges. -- `edges` -- `node_labels` -- `directed` +If `reverse_edges=true`, the graph will contain +the reverse of each edge and the graph will be undirected. + +See also [`Cora`](@ref). 
## Usage Examples + ```juliarepl -julia> using MLDatasets: Cora +using MLDatasets: Cora -julia> data = Cora.alldata() -(edges = [1 9; 1 436; … ; 2708 1390; 2708 2345], node_labels = [3, 6, 5, 5, 4, 4, 7, 3, 3, 7 … 4, 4, 4, 3, 2, 2, 2, 2, 1, 3], directed = true) +data = Cora.dataset() +train_labels = data.node_labels[data.train_indices] ``` """ -function alldata(; dir=nothing) - edges = readdlm(datafile(DEPNAME, "cora.edges", dir), ',', Int) - @assert all(edges[:,3] .== 1) - edges = edges[:,1:2] - - node_labels = readdlm(datafile(DEPNAME, "cora.node_labels", dir), ',', Int) - node_labels = node_labels[:,2] # first column is just 1:n - - return (; edges=edges, - node_labels=node_labels, - directed=true) -end +dataset(; dir=nothing, reverse_edges=true) = + read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges) + + +end #module -end \ No newline at end of file diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl index 3f43d340..f9b1873e 100644 --- a/src/MLDatasets.jl +++ b/src/MLDatasets.jl @@ -1,7 +1,9 @@ module MLDatasets using Requires +using DelimitedFiles: readdlm using FixedPointNumbers, ColorTypes +using PyCall bytes_to_type(::Type{UInt8}, A::Array{UInt8}) = A bytes_to_type(::Type{N0f8}, A::Array{UInt8}) = reinterpret(N0f8, A) @@ -45,13 +47,29 @@ include("PTBLM/PTBLM.jl") include("UD_English/UD_English.jl") # Graphs -include("Cora/Cora.jl") +include("planetoid.jl") + include("Cora/Cora.jl") + include("PubMed/PubMed.jl") + include("CiteSeer/CiteSeer.jl") function __init__() # initialize optional dependencies @require ImageCore="a09fc81d-aa75-5fe9-8630-4744c3626534" begin global __images_supported__ = true end + + + py""" + import numpy as np + import pickle + + def pyread_planetoid_file(path, name): + out = pickle.load(open(path, "rb"), encoding="latin1") + if name == 'graph': + return out + out = out.todense() if hasattr(out, 'todense') else out + return out + """ end end diff --git a/src/PubMed/PubMed.jl b/src/PubMed/PubMed.jl new file mode 100644 index 00000000..ac29afa7 --- /dev/null +++ b/src/PubMed/PubMed.jl @@ -0,0 +1,79 @@ +export PubMed + + +""" + PubMed + +The PubMed citation network dataset from Ref. [1]. +Nodes represent documents and edges represent citation links. +The dataset is designed for the node classification task. +The task is to predict the category of certain paper. +The dataset is retrieved from Ref. [2]. + +## Interface + +- [`PubMed.dataset`](@ref) + +## References + +[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815) +[2]: [Planetoid](https://github.com/kimiyoung/planetoid) +""" +module PubMed + +using DataDeps +using ..MLDatasets: datafile, read_planetoid_data +using DelimitedFiles: readdlm + +using PyCall + +const DEPNAME = "PubMed" +const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data" +const DOCS = "https://github.com/kimiyoung/planetoid" +const DATA = "ind.pubmed." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"] + +function __init__() + register(DataDep( + DEPNAME, + """ + Dataset: The $DEPNAME dataset. + Website: $DOCS + """, + joinpath.(LINK, DATA), + "0b8bf8e80564611b540655e9cbb8c5900dd3728d4ababe0b990b6f27144bd76c", # if checksum omitted, will be generated by DataDeps + # post_fetch_method = unpack + )) +end + +""" + dataset(; dir=nothing) + +Retrieve the PubMed dataset. 
The output is a named tuple with fields +```juliarepl +julia> keys(PubMed.dataset()) +(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed) +``` + +In particular, `adjacency_list` is a vector of vector, +where `adjacency_list[i]` will contain the neighbors of node `i` +through outgoing edges. + +If `reverse_edges=true`, the graph will contain +the reverse of each edge and the graph will be undirected. + +See also [`PubMed`](@ref). + +## Usage Examples + +```julia +using MLDatasets: PubMed +data = PubMed.dataset() +train_labels = data.node_labels[data.train_indices] +``` +""" +dataset(; dir=nothing, reverse_edges=true) = + read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges) + + +end #module + diff --git a/src/UD_English/UD_English.jl b/src/UD_English/UD_English.jl index f23a46cc..34de3646 100644 --- a/src/UD_English/UD_English.jl +++ b/src/UD_English/UD_English.jl @@ -70,7 +70,7 @@ module UD_English detail on the Website. """, "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/" .* [TRAINFILE, DEVFILE, TESTFILE], - "f2ccb6da7ec5fb0b617f0e7d7a13f3e292621eadbc324a1b3e7479d50a290177" + "e08d57e95264ac97ca861261e3119e093c054453c5dfc583e2402459504d93b7" )) end end diff --git a/src/planetoid.jl b/src/planetoid.jl new file mode 100644 index 00000000..d6fd8824 --- /dev/null +++ b/src/planetoid.jl @@ -0,0 +1,86 @@ +""" +Read any of the citation network datasets “Cora”, “CiteSeer” and “PubMed” +from the “Revisiting Semi-Supervised Learning with Graph Embeddings” paper. +Nodes represent documents and edges represent citation links. + +Data collected from +https://github.com/kimiyoung/planetoid/raw/master/data +""" +function read_planetoid_data(DEPNAME; dir=nothing, reverse_edges=true) + name = lowercase(DEPNAME) + + x = read_planetoid_file(DEPNAME, "ind.$(name).x", dir) + y = read_planetoid_file(DEPNAME, "ind.$(name).y", dir) + allx = read_planetoid_file(DEPNAME, "ind.$(name).allx", dir) + ally = read_planetoid_file(DEPNAME, "ind.$(name).ally", dir) + tx = read_planetoid_file(DEPNAME, "ind.$(name).tx", dir) + ty = read_planetoid_file(DEPNAME, "ind.$(name).ty", dir) + graph = read_planetoid_file(DEPNAME, "ind.$(name).graph", dir) + test_index = read_planetoid_file(DEPNAME, "ind.$(name).test.index", dir) + + ntrain = size(x, 2) + train_index = 1:ntrain + val_index = ntrain+1:ntrain+500 + sorted_test_index = sort(test_index) + + if name == "citeseer" + # There are some isolated nodes in the Citeseer graph, resulting in + # none consecutive test indices. We need to identify them and add them + # as zero vectors to `tx` and `ty`. 
+ len_test_indices = (maximum(test_index) - minimum(test_index)) + 1 + + tx_ext = zeros(size(tx,1), len_test_indices) + tx_ext[:, sorted_test_index .- minimum(test_index) .+ 1] .= tx + ty_ext = zeros(len_test_indices) + ty_ext[sorted_test_index .- minimum(test_index) .+ 1] = ty + + tx, ty = tx_ext, ty_ext + end + x = hcat(allx, tx) + y = vcat(ally, ty) + x[:, test_index] = x[:, sorted_test_index] + y[test_index] = y[sorted_test_index] + test_index = size(allx,2)+1:size(x,2) + + num_nodes = size(x, 2) + adj_list = [Int[] for i=1:num_nodes] + for (i, neigs) in pairs(graph) # graph is dictionay representing the adjacency list + neigs = unique(neigs) # remove duplicated edges + neigs = filter(x -> x!=i, neigs)# remove self-loops + append!(adj_list[i+1], neigs .+ 1) # convert to 1-indexed + end + if reverse_edges + for (i, neigs) in enumerate(adj_list) + for j in neigs + i ∉ adj_list[j] && push!(adj_list[j], i) + end + end + end + + return (; node_features = x, + node_labels = y, + adjacency_list = adj_list, + train_indices = train_index, + val_indices = val_index, + test_indices = test_index, + num_classes = length(unique(y)), + num_nodes = num_nodes, + num_edges = sum(length.(adj_list)), + directed = reverse_edges != true) +end + +function read_planetoid_file(DEPNAME, name, dir) + filename = datafile(DEPNAME, name, dir) + if endswith(name, "test.index") + out = 1 .+ vec(readdlm(filename, Int)) + else + out = py"pyread_planetoid_file"(filename, name) + if out isa Matrix + out = collect(out') + end + end + if endswith(name, "y") + out = map(y->y[1], argmax(out, dims=1)) |> vec + end + return out +end diff --git a/test/runtests.jl b/test/runtests.jl index 8d46708b..bd0156ae 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,9 @@ tests = [ "tst_fashion_mnist.jl", "tst_svhn2.jl", "tst_emnist.jl", - "tst_cora.jl" + "tst_cora.jl", + "tst_citeseer.jl", + "tst_pubmed.jl", ] for t in tests diff --git a/test/tst_citeseer.jl b/test/tst_citeseer.jl new file mode 100644 index 00000000..74e70581 --- /dev/null +++ b/test/tst_citeseer.jl @@ -0,0 +1,19 @@ +data_dir = withenv("DATADEPS_ALWAY_ACCEPT"=>"true") do + datadep"CiteSeer" +end + +@testset "CiteSeer" begin + data = CiteSeer.dataset() + + @test data.num_nodes == 3327 + @test data.num_edges == 9104 + @test size(data.node_features) == (3703, data.num_nodes) + @test size(data.node_labels) == (data.num_nodes,) + @test size(data.train_indices) == (120,) + @test size(data.val_indices) == (500,) + @test size(data.test_indices) == (1015,) + @test size(data.adjacency_list) == (data.num_nodes, ) + @test sum(length.(data.adjacency_list)) == (data.num_edges) + @test minimum(minimum.(data.adjacency_list; init=1000)) == 1 + @test maximum(maximum.(data.adjacency_list; init=1000)) == data.num_nodes +end diff --git a/test/tst_cora.jl b/test/tst_cora.jl index ed528135..5f92fd02 100644 --- a/test/tst_cora.jl +++ b/test/tst_cora.jl @@ -3,12 +3,18 @@ data_dir = withenv("DATADEPS_ALWAY_ACCEPT"=>"true") do end @testset "Cora" begin - data = Cora.alldata() - @test data isa NamedTuple - - @test data.edges isa Matrix{Int} - @test size(data.edges) == (5429, 2) - @test data.node_labels isa Vector{Int} - @test size(data.node_labels) == (2708,) - @test data.directed + data = Cora.dataset() + + @test data.num_nodes == 2708 + @test data.num_edges == 10556 + @test data.directed == true + @test size(data.node_features) == (1433, data.num_nodes) + @test size(data.node_labels) == (data.num_nodes,) + @test size(data.train_indices) == (140,) + @test size(data.val_indices) == 
(500,) + @test size(data.test_indices) == (1000,) + @test size(data.adjacency_list) == (data.num_nodes, ) + @test sum(length.(data.adjacency_list)) == (data.num_edges) + @test minimum(minimum.(data.adjacency_list)) == 1 + @test maximum(maximum.(data.adjacency_list)) == data.num_nodes end diff --git a/test/tst_pubmed.jl b/test/tst_pubmed.jl new file mode 100644 index 00000000..6e223649 --- /dev/null +++ b/test/tst_pubmed.jl @@ -0,0 +1,21 @@ +data_dir = withenv("DATADEPS_ALWAY_ACCEPT"=>"true") do + datadep"PubMed" +end + +@testset "PubMed" begin + data = PubMed.dataset() + + @test data.num_nodes == 19717 + @test data.num_edges == 88648 + @test data.directed == true + @test size(data.node_features) == (500, data.num_nodes) + @test size(data.node_labels) == (data.num_nodes,) + @test size(data.train_indices) == (60,) + @test size(data.val_indices) == (500,) + @test size(data.test_indices) == (1000,) + @test size(data.adjacency_list) == (data.num_nodes, ) + @test sum(length.(data.adjacency_list)) == (data.num_edges) + @test minimum(minimum.(data.adjacency_list)) == 1 + @test maximum(maximum.(data.adjacency_list)) == data.num_nodes + +end From fbfa33784cb24ec580e94ce301c3a77123d4bb17 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:19:54 +0200 Subject: [PATCH 2/7] don't use system Python --- .github/workflows/UnitTest.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml index eaa07e75..1f26a784 100644 --- a/.github/workflows/UnitTest.yml +++ b/.github/workflows/UnitTest.yml @@ -18,7 +18,8 @@ jobs: matrix: julia-version: ['1.0', '1', 'nightly'] os: [ubuntu-latest, windows-latest, macOS-latest] - + env: + PYTHON: "" steps: - uses: actions/checkout@v1.0.0 - name: "Set up Julia" From 49a3415b53992b7d251f2d074c8e25d9758cee29 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:33:36 +0200 Subject: [PATCH 3/7] pyimport_conda --- src/MLDatasets.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl index f9b1873e..7a59410a 100644 --- a/src/MLDatasets.jl +++ b/src/MLDatasets.jl @@ -58,9 +58,10 @@ function __init__() global __images_supported__ = true end + # install scipy if not already there + pyimport_conda("scipy", "scipy") py""" - import numpy as np import pickle def pyread_planetoid_file(path, name): From b40159908f0d944f5b057d57598852a9eb57fa8a Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:35:25 +0200 Subject: [PATCH 4/7] fix documenter --- .github/workflows/Documenter.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml index 73ba2ea1..c961e9b3 100644 --- a/.github/workflows/Documenter.yml +++ b/.github/workflows/Documenter.yml @@ -10,6 +10,8 @@ on: jobs: build: runs-on: ubuntu-latest + env: + PYTHON: "" steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest From cafc860d690ff950a1a427562ed9b036b2aa83f6 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:47:11 +0200 Subject: [PATCH 5/7] fixes --- docs/make.jl | 4 ++++ docs/src/utils.md | 4 ++++ test/tst_citeseer.jl | 1 + test/tst_cora.jl | 2 +- test/tst_pubmed.jl | 2 +- 5 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 docs/src/utils.md diff --git a/docs/make.jl b/docs/make.jl index c6e33b08..0cb021dd 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -19,6 +19,7 @@ makedocs( ), authors = "Hiroyuki Shindo, Christof 
Stocker", + # TODO: automatize `pages` creation pages = Any[ "Home" => "index.md", "Available Datasets" => Any[ @@ -40,10 +41,13 @@ makedocs( ], "Graphs" => Any[ + "CiteSeer" => "datasets/CiteSeer.md", "Cora" => "datasets/Cora.md", + "PubMed" => "datasets/PubMed.md", ], ], + "Utis" -> "utils.md", "LICENSE.md", ], strict = true diff --git a/docs/src/utils.md b/docs/src/utils.md new file mode 100644 index 00000000..a232e161 --- /dev/null +++ b/docs/src/utils.md @@ -0,0 +1,4 @@ +# Utils +```@docs +MLDatasets.read_planetoid_data +``` \ No newline at end of file diff --git a/test/tst_citeseer.jl b/test/tst_citeseer.jl index 74e70581..3633f9b7 100644 --- a/test/tst_citeseer.jl +++ b/test/tst_citeseer.jl @@ -7,6 +7,7 @@ end @test data.num_nodes == 3327 @test data.num_edges == 9104 + @test data.directed == false @test size(data.node_features) == (3703, data.num_nodes) @test size(data.node_labels) == (data.num_nodes,) @test size(data.train_indices) == (120,) diff --git a/test/tst_cora.jl b/test/tst_cora.jl index 5f92fd02..8e5e8b0c 100644 --- a/test/tst_cora.jl +++ b/test/tst_cora.jl @@ -7,7 +7,7 @@ end @test data.num_nodes == 2708 @test data.num_edges == 10556 - @test data.directed == true + @test data.directed == false @test size(data.node_features) == (1433, data.num_nodes) @test size(data.node_labels) == (data.num_nodes,) @test size(data.train_indices) == (140,) diff --git a/test/tst_pubmed.jl b/test/tst_pubmed.jl index 6e223649..30603b4a 100644 --- a/test/tst_pubmed.jl +++ b/test/tst_pubmed.jl @@ -7,7 +7,7 @@ end @test data.num_nodes == 19717 @test data.num_edges == 88648 - @test data.directed == true + @test data.directed == false @test size(data.node_features) == (500, data.num_nodes) @test size(data.node_labels) == (data.num_nodes,) @test size(data.train_indices) == (60,) From 2c5d64c1a08cf7110772a49700966fc3a1079eec Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 14:52:35 +0200 Subject: [PATCH 6/7] doc fix --- docs/make.jl | 2 +- docs/src/utils.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 0cb021dd..71aa5542 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -47,7 +47,7 @@ makedocs( ], ], - "Utis" -> "utils.md", + "Utils" => "utils.md", "LICENSE.md", ], strict = true diff --git a/docs/src/utils.md b/docs/src/utils.md index a232e161..ecd2791a 100644 --- a/docs/src/utils.md +++ b/docs/src/utils.md @@ -1,4 +1,5 @@ # Utils + ```@docs MLDatasets.read_planetoid_data ``` \ No newline at end of file From d40696a8a120285272ae03660b5af8b4122d3549 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 27 Jul 2021 16:24:27 +0200 Subject: [PATCH 7/7] fixes --- src/CiteSeer/CiteSeer.jl | 2 +- src/Cora/Cora.jl | 2 +- src/PubMed/PubMed.jl | 2 +- test/tst_citeseer.jl | 6 ++++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/CiteSeer/CiteSeer.jl b/src/CiteSeer/CiteSeer.jl index 9c2596f3..24d35f01 100644 --- a/src/CiteSeer/CiteSeer.jl +++ b/src/CiteSeer/CiteSeer.jl @@ -39,7 +39,7 @@ function __init__() Dataset: The $DEPNAME dataset. Website: $DOCS """, - joinpath.(LINK, DATA), + map(x -> "$LINK/$x", DATA), "7f7ec4df97215c573eee316de35754d89382011dfd9fb2b954a4a491057e3eb3", # if checksum omitted, will be generated by DataDeps # post_fetch_method = unpack )) diff --git a/src/Cora/Cora.jl b/src/Cora/Cora.jl index 2a46050e..01faad31 100644 --- a/src/Cora/Cora.jl +++ b/src/Cora/Cora.jl @@ -55,7 +55,7 @@ function __init__() Dataset: The $DEPNAME dataset. 
Website: $DOCS """, - joinpath.(LINK, DATA), + map(x -> "$LINK/$x", DATA), "81de017067dc045ebdb8ffd5c0e69a209973ffdb1fe2d5b434e94d3614f3f5c7", # if checksum omitted, will be generated by DataDeps # post_fetch_method = unpack )) diff --git a/src/PubMed/PubMed.jl b/src/PubMed/PubMed.jl index ac29afa7..ea9ae482 100644 --- a/src/PubMed/PubMed.jl +++ b/src/PubMed/PubMed.jl @@ -39,7 +39,7 @@ function __init__() Dataset: The $DEPNAME dataset. Website: $DOCS """, - joinpath.(LINK, DATA), + map(x -> "$LINK/$x", DATA), "0b8bf8e80564611b540655e9cbb8c5900dd3728d4ababe0b990b6f27144bd76c", # if checksum omitted, will be generated by DataDeps # post_fetch_method = unpack )) diff --git a/test/tst_citeseer.jl b/test/tst_citeseer.jl index 3633f9b7..6a6edc00 100644 --- a/test/tst_citeseer.jl +++ b/test/tst_citeseer.jl @@ -15,6 +15,8 @@ end @test size(data.test_indices) == (1015,) @test size(data.adjacency_list) == (data.num_nodes, ) @test sum(length.(data.adjacency_list)) == (data.num_edges) - @test minimum(minimum.(data.adjacency_list; init=1000)) == 1 - @test maximum(maximum.(data.adjacency_list; init=1000)) == data.num_nodes + if VERSION >= v"1.6.0" + @test minimum(minimum.(data.adjacency_list; init=1000)) == 1 + @test maximum(maximum.(data.adjacency_list; init=1000)) == data.num_nodes + end end
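
For downstream users of this patch series, here is a minimal usage sketch of the Planetoid-style loaders it introduces. It is illustrative only: the `DATADEPS_ALWAYS_ACCEPT` environment variable is the standard DataDeps.jl auto-accept switch (assumed to be appropriate here), and the edge-list conversion is not part of the PR, just one way to consume the documented `adjacency_list` field.

```julia
# Usage sketch (not part of the patch): load one of the new citation datasets
# and flatten its adjacency list into an edge matrix.
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"   # skip the interactive download prompt

using MLDatasets: Cora, CiteSeer, PubMed

data = Cora.dataset()                    # named tuple documented in Cora.dataset

X = data.node_features                   # 1433 × num_nodes feature matrix
y = data.node_labels                     # integer labels in 1:num_classes

# `adjacency_list[i]` holds the 1-indexed neighbors of node i through outgoing edges.
edges = reduce(hcat, [[i, j] for (i, neigs) in enumerate(data.adjacency_list) for j in neigs])
@assert size(edges, 2) == data.num_edges

# Semi-supervised split from the original Planetoid paper.
train_X = X[:, data.train_indices]
train_y = y[data.train_indices]
```

The same pattern applies to `CiteSeer.dataset()` and `PubMed.dataset()`, since all three loaders delegate to `read_planetoid_data` and return a named tuple with the same fields.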