# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This package implements some benchmark dataset for graph network
and node representation learning.
"""
import os
import io
import sys
import numpy as np
import pickle as pkl
from pgl import graph
from pgl.utils.logger import log
# Public API of this module.
__all__ = [
    "CitationDataset",
    "CoraDataset",
    "ArXivDataset",
    "BlogCatalogDataset",
]
def get_default_data_dir(name):
    """Return the absolute path of a bundled dataset directory.

    Args:
        name: Dataset directory name (e.g. "cora", "BlogCatalog").

    Returns:
        Absolute path ``<this package>/data/<name>``.
    """
    base = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')
    return os.path.join(base, name)
def _pickle_load(pkl_file):
"""Load pickle"""
if sys.version_info > (3, 0):
return pkl.load(pkl_file, encoding='latin1')
else:
return pkl.load(pkl_file)
def _parse_index_file(filename):
"""Parse index file."""
index = []
for line in open(filename):
index.append(int(line.strip()))
return index
class CitationDataset(object):
    """Citation dataset helps to create data for citation dataset (Pubmed and Citeseer)

    Args:
        name: The name for the dataset ("pubmed" or "citeseer")
        symmetry_edges: Whether to create symmetry edges.
        self_loop: Whether to contain self loop edges.

    Attributes:
        graph: The :code:`Graph` data object
        y: Labels for each nodes
        num_classes: Number of classes.
        train_index: The index for nodes in training set.
        val_index: The index for nodes in validation set.
        test_index: The index for nodes in test set.
    """

    def __init__(self, name, symmetry_edges=True, self_loop=True):
        self.path = get_default_data_dir(name)
        self.symmetry_edges = symmetry_edges
        self.self_loop = self_loop
        self.name = name
        self._load_data()

    def _load_data(self):
        """Load the pickled Planetoid files and build the graph.

        Reads ``ind.<name>.{x,y,tx,ty,allx,ally,graph}`` plus the
        ``ind.<name>.test.index`` file from ``self.path``.
        """
        # networkx is only needed in this method, so import it lazily.
        import networkx as nx
        objnames = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(objnames)):
            with open("{}/ind.{}.{}".format(self.path, self.name, objnames[i]),
                      'rb') as f:
                objects.append(_pickle_load(f))
        x, y, tx, ty, allx, ally, _graph = objects
        # Test node ids are stored in shuffled order; the sorted copy gives
        # the contiguous row order the stacked feature matrix uses.
        test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format(
            self.path, self.name))
        test_idx_range = np.sort(test_idx_reorder)
        allx = allx.todense()
        tx = tx.todense()
        if self.name == 'citeseer':
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            test_idx_range_full = range(
                min(test_idx_reorder), max(test_idx_reorder) + 1)
            tx_extended = np.zeros(
                (len(test_idx_range_full), x.shape[1]), dtype="float32")
            tx_extended[test_idx_range - min(test_idx_range), :] = tx
            tx = tx_extended
            ty_extended = np.zeros(
                (len(test_idx_range_full), y.shape[1]), dtype="float32")
            ty_extended[test_idx_range - min(test_idx_range), :] = ty
            ty = ty_extended
        # Stack train+test features, restore the original (shuffled) order
        # of the test rows, then row-normalize (epsilon guards divide-by-0).
        features = np.vstack([allx, tx])
        features[test_idx_reorder, :] = features[test_idx_range, :]
        features = features / (np.sum(features, axis=-1) + 1e-15)
        features = np.array(features, dtype="float32")
        _graph = nx.DiGraph(nx.from_dict_of_lists(_graph))
        # Same reorder trick for the one-hot labels.
        onehot_labels = np.vstack((ally, ty))
        onehot_labels[test_idx_reorder, :] = onehot_labels[test_idx_range, :]
        labels = np.argmax(onehot_labels, 1)
        # Standard Planetoid split: first len(y) nodes train, next 500 val.
        idx_test = test_idx_range.tolist()
        idx_train = range(len(y))
        idx_val = range(len(y), len(y) + 500)
        all_edges = []
        for i in _graph.edges():
            u, v = tuple(i)
            all_edges.append((u, v))
            if self.symmetry_edges:
                all_edges.append((v, u))
        if self.self_loop:
            for i in range(_graph.number_of_nodes()):
                all_edges.append((i, i))
        # De-duplicate (symmetry/self-loop insertion can repeat pairs).
        all_edges = list(set(all_edges))
        self.graph = graph.Graph(
            num_nodes=_graph.number_of_nodes(),
            edges=all_edges,
            node_feat={"words": features})
        self.y = np.array(labels, dtype="int64")
        self.num_classes = onehot_labels.shape[1]
        self.train_index = np.array(idx_train, dtype="int32")
        self.val_index = np.array(idx_val, dtype="int32")
        self.test_index = np.array(idx_test, dtype="int32")
class CoraDataset(object):
    """Cora citation-network dataset.

    Args:
        symmetry_edges: Whether to create symmetry edges.
        self_loop: Whether to contain self loop edges.

    Attributes:
        graph: The :code:`Graph` data object
        y: Labels for each nodes
        num_classes: Number of classes.
        train_index: The index for nodes in training set.
        val_index: The index for nodes in validation set.
        test_index: The index for nodes in test set.
    """

    def __init__(self, symmetry_edges=True, self_loop=True):
        self.path = get_default_data_dir("cora")
        self.symmetry_edges = symmetry_edges
        self.self_loop = self_loop
        self._load_data()

    def _load_data(self):
        """Parse cora.content / cora.cites and build the graph."""
        content_path = os.path.join(self.path, 'cora.content')
        cites_path = os.path.join(self.path, 'cora.cites')

        node_feature = []
        paper_ids = []
        labels = []
        label_ids = {}
        with open(content_path, 'r') as f:
            for row in f:
                fields = row.strip().split()
                pid = int(fields[0])
                cls = fields[-1]
                # Assign class ids in first-seen order.
                if cls not in label_ids:
                    label_ids[cls] = len(label_ids)
                vec = np.array(
                    [int(tok) for tok in fields[1:-1]], dtype="float32")
                # L1-normalize the bag-of-words row; epsilon guards /0.
                vec = vec / (np.sum(vec) + 1e-15)
                node_feature.append(vec)
                labels.append(label_ids[cls])
                paper_ids.append(pid)

        pid2idx = {pid: idx for idx, pid in enumerate(paper_ids)}
        num_nodes = len(paper_ids)
        node_feature = np.array(node_feature, dtype="float32")

        all_edges = []
        with open(cites_path, 'r') as f:
            for row in f:
                src, dst = row.split()
                src, dst = pid2idx[int(src)], pid2idx[int(dst)]
                all_edges.append((src, dst))
                if self.symmetry_edges:
                    all_edges.append((dst, src))
        if self.self_loop:
            all_edges.extend((i, i) for i in range(num_nodes))
        # De-duplicate repeated pairs before building the graph.
        all_edges = list(set(all_edges))

        self.graph = graph.Graph(
            num_nodes=num_nodes,
            edges=all_edges,
            node_feat={"words": node_feature})

        # Fixed split over the natural node ordering (no shuffling).
        perm = np.arange(0, num_nodes)
        self.train_index = perm[:140]
        self.val_index = perm[200:500]
        self.test_index = perm[500:1500]
        self.y = np.array(labels, dtype="int64")
        self.num_classes = len(label_ids)
class BlogCatalogDataset(object):
    """BlogCatalog social-network dataset.

    Args:
        symmetry_edges: Whether to create symmetry edges.
        self_loop: Whether to contain self loop edges.

    Attributes:
        graph: The :code:`Graph` data object.
        num_groups: Number of classes.
        train_index: The index for nodes in training set.
        test_index: The index for nodes in validation set.
    """

    def __init__(self, symmetry_edges=True, self_loop=False):
        self.path = get_default_data_dir("BlogCatalog")
        self.num_groups = 39
        self.symmetry_edges = symmetry_edges
        self.self_loop = self_loop
        self._load_data()

    def _load_data(self):
        """Read the node/edge/group CSV files and build the graph."""
        edge_file = os.path.join(self.path, 'edges.csv')
        node_file = os.path.join(self.path, 'nodes.csv')
        group_file = os.path.join(self.path, 'group-edges.csv')

        # nodes.csv holds one line per node.
        with io.open(node_file) as f:
            num_nodes = len(f.readlines())

        # Multi-hot group-membership matrix; ids in the files are 1-based.
        node_feature = np.zeros((num_nodes, self.num_groups))
        with io.open(group_file) as f:
            for row in f:
                nid, gid = row.strip('\n').split(',')
                node_feature[int(nid) - 1][int(gid) - 1] = 1

        all_edges = []
        with io.open(edge_file) as f:
            for row in f:
                src, dst = row.strip('\n').split(',')
                src, dst = int(src) - 1, int(dst) - 1
                all_edges.append((src, dst))
                if self.symmetry_edges:
                    all_edges.append((dst, src))
        if self.self_loop:
            all_edges.extend((i, i) for i in range(num_nodes))
        all_edges = list(set(all_edges))

        self.graph = graph.Graph(
            num_nodes=num_nodes,
            edges=all_edges,
            node_feat={"group_id": node_feature})

        # Random 50/50 train/test split over shuffled node ids.
        perm = np.arange(0, num_nodes)
        np.random.shuffle(perm)
        split = int(num_nodes * 0.5)
        self.train_index = perm[:split]
        self.test_index = perm[split:]
class ArXivDataset(object):
    """ArXiv dataset implementation

    Builds a link-prediction setup from the ca-AstroPh collaboration file:
    half of the undirected edges are held out as positives, an equal number
    of sampled non-edges become negatives, and the remaining half forms the
    graph.

    Args:
        np_random_seed: The random seed for numpy.

    Attributes:
        graph: The :code:`Graph` data object.
        pos_edges: Held-out positive (true) edges.
        neg_edges: Sampled negative (non-)edges, same count as pos_edges.
        node2id: Mapping from raw node label to contiguous integer id.
    """

    def __init__(self, np_random_seed=123):
        self.path = get_default_data_dir("arXiv")
        self.np_random_seed = np_random_seed
        self._load_data()

    def _load_data(self):
        """Parse ca-AstroPh.txt and build the graph plus pos/neg splits."""
        # Seed so the split and negative sampling are reproducible.
        np.random.seed(self.np_random_seed)
        edge_path = os.path.join(self.path, 'ca-AstroPh.txt')
        bi_edges = set()
        self.neg_edges = []
        self.pos_edges = []
        self.node2id = dict()

        def node_id(node):
            # Assign contiguous integer ids in first-seen order.
            if node not in self.node2id:
                self.node2id[node] = len(self.node2id)
            return self.node2id[node]

        with io.open(edge_path) as inf:
            # Skip the 4 header lines of the edge file.
            for _ in range(4):
                inf.readline()
            for line in inf:
                u, v = line.strip('\n').split('\t')
                u, v = node_id(u), node_id(v)
                # Keep each undirected edge once, as (smaller, larger).
                if u < v:
                    bi_edges.add((u, v))
                else:
                    bi_edges.add((v, u))

        num_nodes = len(self.node2id)
        # Rejection-sample node pairs until we have as many negatives as
        # held-out positives (half the undirected edge count). Pairs that
        # hit an existing edge (either direction) or a self-pair are skipped.
        while len(self.neg_edges) < len(bi_edges) // 2:
            random_edges = np.random.choice(num_nodes, [len(bi_edges), 2])
            for (u, v) in random_edges:
                if u != v and (u, v) not in bi_edges and (v, u) not in bi_edges:
                    self.neg_edges.append((u, v))
                    if len(self.neg_edges) == len(bi_edges) // 2:
                        break

        bi_edges = list(bi_edges)
        np.random.shuffle(bi_edges)
        # First half becomes the held-out positives; the rest stays in the
        # training graph.
        self.pos_edges = bi_edges[:len(bi_edges) // 2]
        bi_edges = bi_edges[len(bi_edges) // 2:]
        all_edges = []
        for edge in bi_edges:
            u, v = edge
            # Materialize both directions of each remaining undirected edge.
            all_edges.append((u, v))
            all_edges.append((v, u))
        self.graph = graph.Graph(num_nodes=num_nodes, edges=all_edges)