Merge pull request #70 from JuliaML/cl/cora2

improve Cora + add PubMed and CiteSeer
JuliaML · Jul 27, 2021 · eeb8298 · eeb8298 · CarloLucibello · Jul 27, 2021
2 parents aff7d26 + d40696a
commit eeb8298
Show file tree

Hide file tree

Showing 19 changed files with 424 additions and 46 deletions.
diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml
@@ -10,6 +10,8 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
+    env:
+      PYTHON: ""
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@latest

diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml
@@ -18,7 +18,8 @@ jobs:
       matrix:
         julia-version: ['1.0', '1', 'nightly']
         os: [ubuntu-latest, windows-latest, macOS-latest]
-
+    env:
+      PYTHON: ""
     steps:
       - uses: actions/checkout@v1.0.0
       - name: "Set up Julia"

diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "MLDatasets"
 uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458"
-version = "0.5.8"
+version = "0.5.9"
 
 [deps]
 BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
@@ -10,6 +10,7 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
 GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
 MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
+PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]
@@ -20,6 +21,7 @@ FixedPointNumbers = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
 GZip = "0.5"
 ImageCore = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
 MAT = "0.7, 0.8, 0.9, 0.10"
+PyCall = "1"
 Requires = "1"
 julia = "1"
 

diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ Each dataset has its own dedicated sub-module.
 Find below a list of available datasets and links to their documentation.
 
 #### Vision
-  - [CIFAR10](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/)
+  - [CIFAR10](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR10/)
   - [CIFAR100](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/)
   - [EMNIST](https://juliaml.github.io/MLDatasets.jl/latest/datasets/EMNIST/)
   - [FashionMNIST](https://juliaml.github.io/MLDatasets.jl/latest/datasets/FashionMNIST/)
@@ -38,7 +38,9 @@ Find below a list of available datasets and links to their documentation.
   - [UD_English](https://juliaml.github.io/MLDatasets.jl/latest/datasets/UD_English/)
 
 #### Graphs
+  - [CiteSeer](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CiteSeer/)
   - [Cora](https://juliaml.github.io/MLDatasets.jl/latest/datasets/Cora/)
+  - [PubMed](https://juliaml.github.io/MLDatasets.jl/latest/datasets/PubMed/)
 
 
 

diff --git a/docs/make.jl b/docs/make.jl
@@ -19,6 +19,7 @@ makedocs(
     ),
 
     authors = "Hiroyuki Shindo, Christof Stocker",
+    # TODO: automate `pages` creation
     pages = Any[
         "Home" => "index.md",
         "Available Datasets" => Any[
@@ -40,10 +41,13 @@ makedocs(
             ],
 
             "Graphs" => Any[
+                "CiteSeer" => "datasets/CiteSeer.md",
                 "Cora" => "datasets/Cora.md",
+                "PubMed" => "datasets/PubMed.md",
             ],
 
         ],
+        "Utils" => "utils.md",
         "LICENSE.md",
     ],
     strict = true

diff --git a/docs/src/datasets/CiteSeer.md b/docs/src/datasets/CiteSeer.md
@@ -0,0 +1,11 @@
+# CiteSeer
+
+```@docs
+CiteSeer
+```
+
+## API reference
+
+```@docs
+CiteSeer.dataset
+```
diff --git a/docs/src/datasets/Cora.md b/docs/src/datasets/Cora.md
@@ -7,5 +7,5 @@ Cora
 ## API reference
 
 ```@docs
-Cora.alldata
+Cora.dataset
 ```
diff --git a/docs/src/datasets/PubMed.md b/docs/src/datasets/PubMed.md
@@ -0,0 +1,11 @@
+# PubMed
+
+```@docs
+PubMed
+```
+
+## API reference
+
+```@docs
+PubMed.dataset
+```
diff --git a/docs/src/utils.md b/docs/src/utils.md
@@ -0,0 +1,5 @@
+# Utils
+
+```@docs
+MLDatasets.read_planetoid_data
+```
diff --git a/src/CiteSeer/CiteSeer.jl b/src/CiteSeer/CiteSeer.jl
@@ -0,0 +1,79 @@
+export CiteSeer
+
+
+"""
+    CiteSeer
+
+The CiteSeer citation network dataset from Ref. [1].
+Nodes represent documents and edges represent citation links.
+The dataset is designed for the node classification task. 
+The task is to predict the category of a given paper.
+The dataset is retrieved from Ref. [2].
+
+## Interface
+
+- [`CiteSeer.dataset`](@ref)
+
+## References
+
+[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815)
+[2]: [Planetoid](https://github.com/kimiyoung/planetoid)
+"""
+module CiteSeer
+
+using DataDeps
+using ..MLDatasets: datafile, read_planetoid_data
+using DelimitedFiles: readdlm
+
+using PyCall
+
+const DEPNAME = "CiteSeer"
+const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data"
+const DOCS = "https://github.com/kimiyoung/planetoid"
+const DATA = "ind.citeseer." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"]
+
+function __init__()
+    register(DataDep(
+        DEPNAME,
+        """
+        Dataset: The $DEPNAME dataset.
+        Website: $DOCS
+        """,
+        map(x -> "$LINK/$x", DATA),
+        "7f7ec4df97215c573eee316de35754d89382011dfd9fb2b954a4a491057e3eb3",  # if checksum omitted, will be generated by DataDeps
+        # post_fetch_method = unpack
+    ))
+end
+
+"""
+    dataset(; dir=nothing, reverse_edges=true)
+
+Retrieve the CiteSeer dataset. The output is a named tuple with fields
+```julia-repl
+julia> keys(CiteSeer.dataset())
+(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed)
+```
+
+In particular, `adjacency_list` is a vector of vectors,
+where `adjacency_list[i]` will contain the neighbors of node `i`
+through outgoing edges.
+
+If `reverse_edges=true`, the graph will contain
+the reverse of each edge and the graph will be undirected.
+
+See also [`CiteSeer`](@ref).
+
+## Usage Examples
+
+```julia
+using MLDatasets: CiteSeer
+data = CiteSeer.dataset()
+train_labels = data.node_labels[data.train_indices]
+```
+"""
+dataset(; dir=nothing, reverse_edges=true) = 
+    read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges)
+
+
+end #module 
+
diff --git a/src/Cora/Cora.jl b/src/Cora/Cora.jl
@@ -1,27 +1,52 @@
 export Cora
 
+
 """
     Cora
 
-The full Cora citation network dataset from the
-`"Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via
-Ranking" <https://arxiv.org/abs/1707.03815>`_ paper.
+The Cora citation network dataset from Ref. [1].
 Nodes represent documents and edges represent citation links.
+Each node has a predefined feature with 1433 dimensions. 
+The dataset is designed for the node classification task. 
+The task is to predict the category of a given paper.
+The dataset is retrieved from Ref. [2].
+
+## Statistics 
+
+- Nodes: 2708
+- Edges: 10556
+- Number of Classes: 7
+- Label split:
+    - Train:  140
+    - Val:    500
+    - Test:  1000
+
+The split is the one used in the original paper [1] and 
+doesn't consider all nodes.
 
 ## Interface
 
-- [`Cora.alldata`](@ref)
+- [`Cora.dataset`](@ref)
+
+## References
+
+[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815)
+[2]: [Planetoid](https://github.com/kimiyoung/planetoid)
 """
 module Cora
 
 using DataDeps
-using ..MLDatasets: datafile
+using ..MLDatasets: datafile, read_planetoid_data
 using DelimitedFiles: readdlm
 
-const DEPNAME = "Cora"
-const LINK = "http://nrvis.com/download/data/labeled/cora.zip"
-const DOCS = "http://networkrepository.com/cora.php"
+using PyCall
 
+const DEPNAME = "Cora"
+# LINK = "https://github.com/shchur/gnn-benchmark/raw/master/data/npz"
+# LINK = "https://github.com/abojchevski/graph2gauss/raw/master/data/"
+const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data"
+const DOCS = "https://github.com/kimiyoung/planetoid"
+const DATA = "ind.cora." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"]
 
 function __init__()
     register(DataDep(
@@ -30,40 +55,41 @@ function __init__()
         Dataset: The $DEPNAME dataset.
         Website: $DOCS
         """,
-        LINK,
-        "a3e3a37c34c9385fe8089bbc7c17ef78ecc3bdf8a4b03b80d02aaa080d9501c8",  # if checksum omitted, will be generated by DataDeps
-        post_fetch_method = unpack
+        map(x -> "$LINK/$x", DATA),
+        "81de017067dc045ebdb8ffd5c0e69a209973ffdb1fe2d5b434e94d3614f3f5c7",  # if checksum omitted, will be generated by DataDeps
+        # post_fetch_method = unpack
     ))
 end
 
 """
-    alldata(; dir=nothing)
+    dataset(; dir=nothing, reverse_edges=true)
 
 Retrieve the Cora dataset. The output is a named tuple with fields
+```julia-repl
+julia> keys(Cora.dataset())
+(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed)
+```
+In particular, `adjacency_list` is a vector of vectors,
+where `adjacency_list[i]` will contain the neighbors of node `i`
+through outgoing edges.
 
-- `edges`
-- `node_labels`
-- `directed`
+If `reverse_edges=true`, the graph will contain
+the reverse of each edge and the graph will be undirected.
+
+See also [`Cora`](@ref).
 
 ## Usage Examples
+
 ```juliarepl
-julia> using MLDatasets: Cora
+using MLDatasets: Cora
 
-julia> data = Cora.alldata()
-(edges = [1 9; 1 436; … ; 2708 1390; 2708 2345], node_labels = [3, 6, 5, 5, 4, 4, 7, 3, 3, 7  …  4, 4, 4, 3, 2, 2, 2, 2, 1, 3], directed = true)
+data = Cora.dataset()
+train_labels = data.node_labels[data.train_indices]
 ```
 """
-function alldata(; dir=nothing)
-    edges = readdlm(datafile(DEPNAME, "cora.edges", dir), ',', Int)
-    @assert all(edges[:,3] .== 1)
-    edges = edges[:,1:2]
-
-    node_labels = readdlm(datafile(DEPNAME, "cora.node_labels", dir), ',', Int)
-    node_labels = node_labels[:,2] # first column is just 1:n
-
-    return (; edges=edges, 
-              node_labels=node_labels, 
-              directed=true)
-end
+dataset(; dir=nothing, reverse_edges=true) =
+    read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges)
+
+
+end #module 
 
-end
diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl
@@ -1,7 +1,9 @@
 module MLDatasets
 
 using Requires
+using DelimitedFiles: readdlm
 using FixedPointNumbers, ColorTypes
+using PyCall
 
 bytes_to_type(::Type{UInt8}, A::Array{UInt8}) = A
 bytes_to_type(::Type{N0f8}, A::Array{UInt8}) = reinterpret(N0f8, A)
@@ -45,13 +47,30 @@ include("PTBLM/PTBLM.jl")
 include("UD_English/UD_English.jl")
 
 # Graphs
-include("Cora/Cora.jl")
+include("planetoid.jl")
+    include("Cora/Cora.jl")
+    include("PubMed/PubMed.jl")
+    include("CiteSeer/CiteSeer.jl")
 
 function __init__()
     # initialize optional dependencies
     @require ImageCore="a09fc81d-aa75-5fe9-8630-4744c3626534" begin
         global __images_supported__ = true
     end
+
+    # install scipy if not already there
+    pyimport_conda("scipy", "scipy")
+
+    py"""
+    import pickle
+
+    def pyread_planetoid_file(path, name):
+        out = pickle.load(open(path, "rb"), encoding="latin1")
+        if name == 'graph':
+            return out
+        out = out.todense() if hasattr(out, 'todense') else out
+        return out
+    """
 end
 
 end