"""The core module of DeepGraph (dg).
This module contains the core class ``dg.DeepGraph`` providing the means
to construct, manipulate and partition graphs, and offering interfacing
methods to common network representations and popular Python network
packages. This class also provides plotting methods to visualize graphs
and their properties and to benchmark the graph construction parameters.
For further information type
>>> help(dg.DeepGraph)
"""
import inspect
# Copyright (C) 2017-2023 by
# Dominik Traxl <dominik.traxl@posteo.org>
# All rights reserved.
# BSD license.
import os
from datetime import datetime
from itertools import chain
from deepgraph.iterators_and_indexers import (
_matrix_iterator,
_ft_iterator,
_iter_edges,
_initiate_create_edges,
_aggregate_super_table,
)
from deepgraph.utils import _is_array_like, _dic_translator, _create_bin_edges, _flatten
try:
import matplotlib as mpl
display = "DISPLAY" in os.environ
if not display:
mpl.use("Agg")
import matplotlib.pyplot as plt
except ImportError:
mpl = None
plt = None
import numpy as np
import pandas as pd
# get rid of false positive SettingWithCopyWarnings, see
# http://stackoverflow.com/questions/20625582/how-to-deal-with-this-pandas-warning
pd.options.mode.chained_assignment = None
class DeepGraph:
"""The core class of DeepGraph (dg).
This class encapsulates the graph representation as ``pandas.DataFrame``
objects in its attributes ``v`` and ``e``. It can be initialized with a
node table ``v``, whose rows represent the nodes of the graph, as well
as an edge table ``e``, whose rows represent edges between the nodes.
Given a node table ``v``, it provides methods to iteratively compute
pairwise relations between the nodes using arbitrary, user-defined
functions. These methods provide arguments to parallelize the
computation and control memory consumption (see ``create_edges`` and
``create_edges_ft``).
Also provides methods to partition nodes, edges or an entire graph by
the graph's properties and labels, and to create common network
representations and graph objects of popular Python network packages.
Furthermore, it provides methods to visualize graphs and their
properties and to benchmark the graph construction parameters.
Optionally, the convenience parameter ``supernode_labels_by`` can be
passed, creating supernode labels by enumerating all distinct (tuples
of) values of a (multiple) column(s) of ``v`` . Superedge labels can be
created analogously, by passing the parameter ``superedge_labels_by``.
Parameters
----------
v : pandas.DataFrame or pandas.HDFStore, optional (default=None)
The node table, a table representation of the nodes of a graph. The
index of ``v`` must be unique and represents the node indices. The
column names of ``v`` represent the types of features of the nodes,
and each cell represents a feature of a node. Only a reference to
the input DataFrame is created, not a copy. May also be a
``pandas.HDFStore``, but only ``create_edges`` and
``create_edges_ft`` may then be used (so far).
e : pandas.DataFrame, optional (default=None)
The edge table, a table representation of the edges between the
nodes given by ``v``. Its index has to be a
``pandas.MultiIndex``, whose first level contains the
indices of the source nodes, and the second level contains the
indices of the target nodes. Each row of ``e`` represents an edge,
column names of ``e`` represent the types of relations of the edges,
and each cell in ``e`` represents a relation of an edge. Only a
reference to the input DataFrame is created, not a copy.
supernode_labels_by : dict, optional (default=None)
A dictionary whose keys are strings and their values are (lists of)
column names of ``v``. Appends a column to ``v`` for each key, whose
values correspond to supernode labels, enumerating all distinct
(tuples of) values of the column(s) given by the dict's value.
superedge_labels_by : dict, optional (default=None)
A dictionary whose keys are strings and their values are (lists of)
column names of ``e``. Appends a column to ``e`` for each key, whose
values correspond to superedge labels enumerating all distinct
(tuples of) values of the column(s) given by the dict's value.
Attributes
----------
v : pandas.DataFrame
See Parameters.
e : pandas.DataFrame
See Parameters.
n : int
Property: Number of nodes.
m : int
Property: Number of edges.
f : pd.DataFrame
Property: types of features and number of features of corresponding
type.
r : pd.DataFrame
Property: types of relations and number of relations of
corresponding type.
"""
[docs] def __init__(self, v=None, e=None, supernode_labels_by=None, superedge_labels_by=None):
# create supernode labels by common features
if supernode_labels_by is not None:
for key, value in supernode_labels_by.items():
v[key] = v.groupby(value).grouper.group_info[0]
# create superedge labels by common relations
if superedge_labels_by is not None:
for key, value in superedge_labels_by.items():
e[key] = e.groupby(value).grouper.group_info[0]
# assert v input, set as class attribute
if v is not None:
# assert (isinstance(v, pd.DataFrame) or
# isinstance(v, pd.HDFStore)), (
# "v has to be <type 'pd.DataFrame'> "
# "or <type 'pd.HDFStore'>, not {}".format(type(v)))
self.v = v
# assert e input, set as class attribute
if e is not None:
# assert isinstance(e, pd.DataFrame), (
# "e has to be <type 'pd.DataFrame'>, not {}".format(type(e)))
self.e = e
def __repr__(self):
msg = "<{} object, with n={} node(s) and m={} edge(s) at 0x{:02x}>"
return msg.format(type(self).__name__, self.n, self.m, id(self))
def __str__(self):
msg = "<{} object, with n={} node(s) and m={} edge(s) at 0x{:02x}>"
return msg.format(type(self).__name__, self.n, self.m, id(self))
[docs] def create_edges(
self,
connectors=None,
selectors=None,
transfer_features=None,
r_dtype_dic=None,
no_transfer_rs=None,
step_size=int(1e7),
from_pos=0,
to_pos=None,
hdf_key=None,
verbose=False,
logfile=None,
):
"""Create an edge table ``e`` linking the nodes in ``v``.
This method enables an iterative computation of pairwise relations
(edges) between the nodes represented by ``v``. It does so in a
flexible, efficient and vectorized fashion, easily parallelizable and
with full control over RAM usage.
1. Connectors
The simplest use-case is to define a single connector function
acting on a single column of the node table ``v``. For instance, given
a node table ``v``
>>> import pandas as pd
>>> import deepgraph as dg
>>> v = pd.DataFrame({'time': [0.,2.,9.], 'x': [3.,1.,12.]})
>>> g = dg.DeepGraph(v)
>>> g.v
time x
0 0 3
1 2 1
2 9 12
one may define a function
>>> def time_difference(time_s, time_t):
... dt = time_t - time_s
... return dt
and pass it to ``create_edges``, in order to compute the time
difference of each pair of nodes
>>> g.create_edges(connectors=time_difference)
>>> g.e
dt
s t
0 1 2
2 9
1 2 7
As one can see, the connector function takes column names of ``v`` with
additional '_s' and '_t' endings (indicating source node values and
target node values, respectively) as input, and returns a variable with
the computed values. The resulting edge table ``g.e`` is indexed by the
node indices ('s' and 't', representing source and target node indices,
respectively), and has one column ('dt', the name of the returned
variable) with the computed values of the given connector. Note that
only the upper triangle adjacency matrix is computed, which is always
the case. See Notes for further information.
One may also pass a list of functions to ``connectors``, which are then
computed in the list's order. Generally, a connector function can take
multiple column names of ``v`` (with '_s' and/or '_t' appended) as
input, as well as already computed relations of former connectors.
Also, any connector function may have multiple output variables. Every
output variable has to be a 1-dimensional ``np.ndarray`` (with
arbitrary dtype, including ``object``). The return statement may not
contain any operators, only references to each computed relation.
For instance, considering the above example, one may define an
additional connector
>>> def velocity(dt, x_s, x_t):
... dx = x_t - x_s
... v = dx / dt
... return v, dx
and then apply both connectors on ``v``, resulting in
>>> g.create_edges(connectors=[time_difference, velocity])
>>> g.e
dt dx v
s t
0 1 2 -2 -1.000000
2 9 9 1.000000
1 2 7 11 1.571429
2. Selectors
However, one is often only interested in a subset of all possible
edges. In order to select edges during the iteration process - based on
some conditions on the node's features and their computed relations -
one may pass a (list of) selector function(s) to ``create_edges``. For
instance, given the above example, one may define a selector
>>> def dt_thresh(dt, sources, targets):
... sources = sources[dt > 5]
... targets = targets[dt > 5]
... return sources, targets
and apply it in conjunction with the ``time_difference`` connector
>>> g.create_edges(connectors=time_difference, selectors=dt_thresh)
>>> g.e
dt
s t
0 2 9
1 2 7
leaving only edges with a time difference larger than 5.
Every selector function must have ``sources`` and ``targets`` as input
arguments as well as in the return statement. Most generally, they may
depend on column names of ``v`` (with '_s' and/or '_t' appended) and/or
computed relations of connector functions, and/or computed relations of
former selector functions. Apart from ``sources`` and ``targets``, they
may additionally return computed relations. Given this input/output
flexibility of selectors, one could in fact compute all required
relations, and select any desired subset of edges, with a single
selector function. The purpose of splitting connectors and/or
selectors, however, is to control the iteration's performance by
consecutively computing relations and selecting edges: **hierarchical
selection**.
3. Hierarchical Selection
As the algorithm iterates through the chunks of all possible source and
target node indices ([0, g.n*(g.n-1)/2]), it goes through the list of
``selectors`` at each step. If a selector has a relation as input, it
must have either been computed by a former selector, or the selector
requests its computation by the corresponding connector function in
``connectors`` (this connector may not depend on any other not yet
computed relations). Once the input relations are computed (if
requested), the selector is applied and returns updated indices, which
are then passed to the next selector. Hence, with each selector, the
indices are reduced and consecutive computation of relations only
consider the remaining indices. After all selectors have been applied,
the connector functions that have not been requested by any selector
are computed (on the final, reduced chunk of node and target indices).
4. Transferring Features
The argument ``transfer_features``, which takes a (list of) column
name(s) of ``v``, makes it possible to transfer features of ``v`` to
the created edge table ``e``
>>> g.create_edges(connectors=time_difference,
... transfer_features=['x', 'time'])
>>> g.e
dt time_s time_t x_s x_t
s t
0 1 2 0 2 3 1
2 9 0 9 3 12
1 2 7 2 9 1 12
If computation time and memory consumption are of no concern, one might
skip the remaing paragraphs.
5. Logging
Clearly, the order of the hierarchical selection as described in 3.
influences the computation's efficiency. The complexity of a relation's
computation and the (expected average) number of deleted edges of a
selector should be considered primarily. In order to track and
benchmark the iteration process, the progress and time measurements are
printed for each iteration step, if ``verbose`` is set to True.
Furthermore, one may create a logfile (which can also be plot by
``dg.DeepGraph.plot_logfile``) by setting the argument ``logfile`` to a
string, indicating the file name of the created logfile.
6. Parallelization and Memory Control
The arguments ``from_pos``, ``to_pos`` and ``step_size`` control the
range of processed pairs of nodes and the number of pairs of nodes to
process at each iteration step. They may be used for parallel
computation and to control RAM usage. See Parameters for details.
It is also possible to initiate ``dg.DeepGraph`` with a
``pandas.HDFStore`` containing the DataFrame representing the node
table. Only the data requested by ``transfer_features`` and the user-
defined ``connectors`` and ``selectors`` at each iteration step is then
pulled from the store, which is particularly useful for large node
tables and parallel computation. The only requirement is that the node
table contained in the store is in table(t) format, not fixed(f)
format. For instance, considering the above created node table, one may
store it in a hdf file
>>> vstore = pd.HDFStore('vstore.h5')
>>> vstore.put('node_table', v, format='t', index=False)
initiate a DeepGraph instance with the store
>>> g = dg.DeepGraph(vstore)
>>> g.v
<class 'pandas.io.pytables.HDFStore'>
File path: vstore.h5
/node_table frame_table (typ->appendable,nrows->3,ncols->2,
indexers->[index])
and then create edges the same way as if ``g.v`` were a DataFrame
>>> g.create_edges(connectors=time_difference)
>>> g.e
dt
s t
0 1 2
2 9
1 2 7
In case the store has multiple nodes, ``hdf_key`` has to be set to the
node corresponding to the node table of the graph.
Also, one may pass a (list of) name(s) of computed relations,
``no_transfer_rs``, which should not be transferred to the created edge
table ``e``. This can be advantageous, for instance, if a selector
depends on computed relations that are of no further interest.
Furthermore, it is possible to force the dtype of computed relations
with the argument ``r_dtype_dic``. The dtype of a relation is then set
at each iteration step, but **after** all selectors and connectors were
processed.
7. Creating Edges on a Fast Track
If the selection of edges includes a simple distance threshold, i.e. a
selector function defined as follows:
>>> def ft_selector(x_s, x_t, threshold, sources, targets):
... dx = x_t - x_s
... sources = sources[dx <= threshold]
... targets = targets[dx <= threshold]
... return sources, targets, dx
the method ``create_edges_ft`` should be considered, since it provides
a much faster iteration algorithm.
Parameters
----------
connectors : function or array_like, optional (default=None)
User defined connector function(s) that compute pairwise relations
between the nodes in ``v``. A connector accepts multiple column
names of ``v`` (with '_s' and/or '_t' appended, indicating source
node values and target node values, respectively) as input, as well
as already computed relations of former connectors. A connector
function may have multiple output variables. Every output variable
has to be a 1-dimensional ``np.ndarray`` (with arbitrary dtype,
including ``object``). See above and ``dg.functions`` for examplary
connector functions.
selectors : function or array_like, optional (default=None)
User defined selector function(s) that select edges during the
iteration process, based on some conditions on the node's features
and their computed relations. Every selector function must have
``sources`` and ``targets`` as input arguments as well as in the
return statement. A selector may depend on column names of ``v``
(with '_s' and/or '_t' appended) and/or computed relations of
connector functions, and/or computed relations of former selector
functions. Apart from ``sources`` and ``targets``, they may also
return computed relations (see connectors). See above, and
``dg.functions`` for exemplary selector functions.
transfer_features : str, int or array_like, optional (default=None)
A (list of) column name(s) of ``v``, indicating which features of
``v`` to transfer to ``e`` (appending '_s' and '_t' to the column
names of ``e``, indicating source and target node features,
respectively).
r_dtype_dic : dict, optional (default=None)
A dictionary with names of computed relations of connectors and/or
selectors as keys and dtypes as values. Forces the data types of
the computed relations in ``e`` during the iteration (but **after**
all selectors and connectors were processed), otherwise infers
them.
no_transfer_rs : str or array_like, optional (default=None)
Name(s) of computed relations that are not to be transferred to the
created edge table ``e``. Can be used to save memory, e.g., if a
selector depends on computed relations that are of no interest
otherwise.
step_size : int, optional (default=1e6)
The number of pairs of nodes to process at each iteration step.
Must be in [ 1, g.n*(g.n-1)/2 ]. Its value determines computation
speed and memory consumption.
from_pos : int, optional (default=0)
Determines from which pair of nodes to start the iteration process.
Must be in [ 0, g.n*(g.n-1)/2 [. May be used in conjuction with
``to_pos`` for parallel computation.
to_pos : positive integer, optional (default=None)
Determines at which pair of nodes to stop the iteration process
(the endpoint is excluded). Must be in [ 1, g.n*(g.n-1)/2 ] and
larger than ``from_pos``. Defaults to None, which translates to the
last pair of nodes, g.n*(g.n-1)/2. May be used in conjunction with
``from_pos`` for parallel computation.
hdf_key : str, optional (default=None)
If you initialized ``dg.DeepGraph`` with a ``pandas.HDFStore`` and
the store has multiple nodes, you must pass the key to the node in
the store that corresponds to the node table.
verbose : bool, optional (default=False)
Whether to print information at each step of the iteration process.
logfile : str, optional (default=None)
Create a log-file named by ``logfile``. Contains the time and date
of the method's call, the input arguments and time mesaurements for
each iteration step. A plot of ``logfile`` can be created by
``dg.DeepGraph.plot_logfile``.
Returns
-------
e : pd.DataFrame
Set the created edge table ``e`` as attribute of ``dg.DeepGraph``.
See also
--------
create_edges_ft
Notes
-----
1. Input and output data types
Since connectors (and selectors) take columns of a pandas DataFrame as
input, there are no restrictions on the data types of which pairwise
relations are computed. In the most general case, a DataFrame's column
has ``object`` as dtype, and its values may then be arbitrary Python
objects. The same goes for the output variables of connectors (and
selectors). The only requirement is that each ouput variable is
1-dimensional.
However, it is also possible to use the values of a column of ``v`` as
references to arbitrary objects, which may sometimes be more
convenient. In case a connector (or selector) needs the node's original
indices as input, one may simply copy them to a column, e.g.
>>> v['indices'] = v.index
and then define the connector's (or selector's) input arguments
accordingly.
2. Connectors and selectors
The only requirement on connectors and selectors is that their input
arguments and return statements are consistent with the column names of
``v`` and the passing of computed relations (see above, 3. Hierarchical
Selection).
Whatever happens inside the functions is entirely up to the user. This
means, for instance, that one may wrap arbitrary functions within a
connector (selector), such as optimized C functions or existing
functions whose input/output is not consistent with the
``create_edges`` method (see, e.g., the methods provided in
``dg.functions``, ``scipy`` or scikit learn's ``sklearn.metrics`` and
``sklearn.neighbors.DistanceMetric``). One could also store a
connector's (selector's) computations directly within the function, or
let the function print out any desired information during iteration.
3. Why not compute the full adjacency matrix?
This is due to efficiency. For any asymmetric function (i.e., f(s, t)
!= f(t, s)), one can always create an additional connector (or output
variable) that computes the mirrored values of that function.
"""
# logging
if logfile:
_, _, _, argvalues = inspect.getargvalues(inspect.currentframe())
with open(logfile, "w") as log:
print("# LOG FILE", file=log)
print("# function call on: {}".format(datetime.now()), file=log)
print("#", file=log)
print("# Parameters", file=log)
print("# ----------", file=log)
for arg, value in argvalues.items():
print("# ", (arg, value), end="", file=log)
print("", file=log)
print("#", file=log)
print("# Iterations", file=log)
print("# ----------", file=log)
print("# max_pairs exceeded(1) | nr.of pairs | nr.of edges | " "comp.time(s)\n", file=log)
# measure performance
start_generation = datetime.now()
# v shortcut
v = self.v
# adjust keywords
min_chunk_size = step_size
ft_feature = None
# create empty transfer features list if not given
if transfer_features is None:
transfer_features = []
elif not _is_array_like(transfer_features):
transfer_features = [transfer_features]
# hdf_key
if isinstance(v, pd.HDFStore) and hdf_key is None:
assert len(v.keys()) == 1, (
"hdf store has multiple nodes, hdf_key corresponding to the " " node table has to be passed."
)
hdf_key = self.v.keys()[0]
# initialize
coldtypedic, verboseprint = _initiate_create_edges(
verbose, v, ft_feature, connectors, selectors, r_dtype_dic, transfer_features, no_transfer_rs, hdf_key
)
# iteratively create link data frame (matrix iterator)
self.e = _matrix_iterator(
v, min_chunk_size, from_pos, to_pos, coldtypedic, transfer_features, verboseprint, logfile, hdf_key
)
# performance
deltat = datetime.now() - start_generation
verboseprint("")
verboseprint(
"computation time of function call:",
"\ts =",
int(deltat.total_seconds()),
"\tms =",
str(deltat.microseconds / 1000.0)[:6],
"\n",
)
    def create_edges_ft(
self,
ft_feature,
connectors=None,
selectors=None,
transfer_features=None,
r_dtype_dic=None,
no_transfer_rs=None,
min_chunk_size=1000,
max_pairs=int(1e7),
from_pos=0,
to_pos=None,
hdf_key=None,
verbose=False,
logfile=None,
):
"""Create (ft) an edge table ``e`` linking the nodes in ``v``.
This method implements the same functionalities as ``create_edges``,
with the difference of providing a much quicker iteration algorithm
based on a so-called fast-track feature. It is advised to read the
docstring of ``create_edges`` before this one, since only the
differences are explained in the following.
Apart from the hierarchical selection through ``connectors`` and
``selectors`` as described in the method ``create_edges`` (see 1.-3.),
this method necessarily includes the (internal) selector function
>>> def ft_selector(ftf_s, ftf_t, ftt, sources, targets):
... ft_r = ftf_t - ftf_s
... sources = sources[ft_r <= ftt]
... targets = targets[ft_r <= ftt]
... return sources, targets, ft_r
where ``ftf`` is the fast-track feature (a column name of ``v``),
``ftt`` the fast-track threshold (a positive number), and ft_r the
computed fast-track relation. The argument ``ft_feature``, which has
to be a tuple (``ftf``, ``ftt``), determines these variables.
1. The Fast-Track Feature
The simplest use-case, therefore, is to only pass ``ft_feature``. For
instance, given a node table
>>> import pandas as pd
>>> import deepgraph as dg
>>> v = pd.DataFrame({'time': [-3.6,-1.1,1.4,4., 6.3],
... 'x': [-3.,3.,1.,12.,7.]})
>>> g = dg.DeepGraph(v)
>>> g.v
time x
0 -3.6 -3
1 -1.1 3
2 1.4 1
3 4.0 12
4 6.3 7
one may create and select edges by
>>> g.create_edges_ft(ft_feature=('time', 5))
>>> g.e
ft_r
s t
0 1 2.5
2 5.0
1 2 2.5
2 3 2.6
4 4.9
3 4 2.3
leaving only edges with a time difference smaller than (or equal to)
``ftt`` = 5. Note that the node table always has to be sorted by the
fast-track feature. This is due to the fact that the algorithm only
processes pairs of nodes whose fast-track relation is smaller than (or
equal to) the fast-track threshold, and the (pre)determination of these
pairs relies on a sorted DataFrame.
2. Hierarchical Selection
Additionally, one may define ``connectors`` and ``selectors`` as
described in ``create_edges`` (see 1.-3.). Per default, the (internal)
fast-track selector is applied first. Its order of application,
however, may be determined by inserting the string 'ft_selector' in the
desired position of the list of ``selectors``.
The remaining arguments are as described in ``create_edges``, apart
from ``min_chunk_size``, ``max_pairs``, ``from_pos`` and ``to_pos``. If
computation time and/or memory consumption are a concern, one may
therefore read the remaining paragraph.
3. Parallelization and Memory Control on a FastTrack
At each iteration step, the algorithm takes a number of nodes (n =
``min_chunk_size``, per default n=1000) and computes the fast track
relation (distance) between the last node and the first node, d_ftf =
ftf_last - ftf_first. In case d_ftf > ``ftt``, all nodes with a fast-
track feature < ftf_last - ``ftt`` are considered source nodes, and
their relations with all n nodes are computed (hierarchical selection).
In case d_ftf <= ``ftt``, n is increased, s.t. d_ftf > ``ftt``. This
might lead to a large number of pairs of nodes to process at a given
iteration step. In order to control memory consumption, one might
therefore set ``max_pairs`` to a suitable value, triggering a
subiteration if this value is exceeded.
In order to parallelize the iterative computation, one may pass the
arguments ``from_pos`` and ``to_pos``. They determine the range of
**source nodes** to process (endpoint excluded). Hence, ``from_pos``
has to be in [0, g.n[, and ``to_pos`` in [1,g.n]. For instance, given
the node table above
>>> g.v
time x
0 -3.6 -3
1 -1.1 3
2 1.4 1
3 4.0 12
4 6.3 7
we can compute all relations of the source nodes in [1,3[ by
>>> g.create_edges_ft(ft_feature=('time', 5), from_pos=1, to_pos=3)
>>> g.e
ft_r
s t
1 2 2.5
2 3 2.6
4 4.9
Like ``create_edges``, this method also works with a ``pd.HDFStore``
containing the DataFrame representing the node table. Only the data
requested by ``ft_feature``, ``transfer_features`` and the user-defined
``connectors`` and ``selectors`` at each iteration step is then pulled
from the store. The node table in the store has to be in table(t)
format, and additionally, the fast_track feature has to be a data
column. For instance, storing the above node table
>>> vstore = pd.HDFStore('vstore.h5')
>>> vstore.put('node_table', v, format='t', data_columns=True,
... index=False)
one may initiate a DeepGraph instance with the store
>>> g = dg.DeepGraph(vstore)
>>> g.v
<class 'pandas.io.pytables.HDFStore'>
File path: vstore.h5
/node_table frame_table (typ->appendable,nrows->5,ncols->2,
indexers->[index],dc->[time,x])
and then create edges the same way as if ``g.v`` were a DataFrame
>>> g.create_edges_ft(ft_feature=('time', 5), from_pos=1, to_pos=3)
>>> g.e
ft_r
s t
1 2 2.5
2 3 2.6
4 4.9
.. warning:: There is no assertion whether the node table in a store is
sorted by the fast-track feature! The result of an
unsorted table is unpredictable, and generally not
correct.
Parameters
----------
ft_feature : tuple
A tuple (ftf, ftt), where ftf is a column name of ``v`` (the fast-
track feature) and ftt a positive number (the fast-track
threshold). The fast-track feature may contain integers or floats,
but datetime-like values are also accepted. In that case,
``ft_feature`` has to be a tuple of length 3, (ftf, ftt, dt_unit),
where dt_unit is one of {'D','h','m','s','ms','us','ns'}:
- `D`: days
- `h`: hours
- `m`: minutes
- `s`: seconds
- `ms`: milliseconds
- `us`: microseconds
- `ns`: nanoseconds
determining the unit in which the temporal distance is measured.
The variable name of the fast-track relation transferred to ``e``
is ``ft_r``.
connectors : function or array_like, optional (default=None)
User defined connector function(s) that compute pairwise relations
between the nodes in ``v``. A connector accepts multiple column
names of ``v`` (with '_s' and/or '_t' appended, indicating source
node values and target node values, respectively) as input, as well
as already computed relations of former connectors. A connector
function may have multiple output variables. Every output variable
has to be a 1-dimensional ``np.ndarray`` (with arbitrary dtype,
including ``object``). A connector may also depend on the fast-
track relations ('ft_r'). See ``dg.functions`` for exemplary
connector functions.
selectors : function or array_like, optional (default=None)
User defined selector function(s) that select edges during the
iteration process, based on some conditions on the node's features
and their computed relations. Every selector function must have
``sources`` and ``targets`` as input arguments as well as in the
return statement. A selector may depend on column names of ``v``
(with '_s' and/or '_t' appended) and/or computed relations of
connector functions, and/or computed relations of former selector
functions. Apart from ``sources`` and ``targets``, they may also
return computed relations (see connectors). A selector may also
depend on the fast-track relations ('ft_r'). See ``dg.functions``
for exemplary selector functions.
Note: To specify the hierarchical order of the selection by the
fast-track selector, insert the string 'ft_selector' in the
corresponding position of the ``selectors`` list. Otherwise,
computation of ft_r and selection by the fast-track selector is
carried out first.
transfer_features : str, int or array_like, optional (default=None)
A (list of) column name(s) of ``v``, indicating which features of
``v`` to transfer to ``e`` (appending '_s' and '_t' to the column
names of ``e``, indicating source and target node features,
respectively).
r_dtype_dic : dict, optional (default=None)
A dictionary with names of computed relations of connectors and/or
selectors as keys and dtypes as values. Forces the data types of
the computed relations in ``e`` during the iteration (but **after**
all selectors and connectors were processed), otherwise infers
them.
no_transfer_rs : str or array_like, optional (default=None)
Name(s) of computed relations that are not to be transferred to the
created edge table ``e``. Can be used to save memory, e.g., if a
selector depends on computed relations that are of no interest
otherwise.
min_chunk_size : int, optional (default=1000)
The minimum number of nodes to form pairs of at each iteration
step. See above for details.
max_pairs : positive integer, optional (default=1e6)
The maximum number of pairs of nodes to process at any given
iteration step. If the number is exceeded, a memory saving
subiteration is applied.
from_pos : int, optional (default=0)
The locational index (.iloc) of ``v`` to start the iteration.
Determines the range of **source nodes** to process, in conjunction
with ``to_pos``. Has to be in [0, g.n[, and smaller than
``to_pos``. See above for details and an example.
to_pos : int, optional (default=None)
The locational index (.iloc) of ``v`` to end the iteration
(excluded). Determines the range of **source nodes** to process, in
conjunction with ``from_pos``. Has to be in [1, g.n], and larger
than ``from_pos``. Defaults to None, which translates to the last
node of ``v``, to_pos=g.n. See above for details and an example.
hdf_key : str, optional (default=None)
If you initialized ``dg.DeepGraph`` with a ``pandas.HDFStore`` and
the store has multiple nodes, you must pass the key to the node in
the store that corresponds to the node table.
verbose : bool, optional (default=False)
Whether to print information at each step of the iteration process.
logfile : str, optional (default=None)
Create a log-file named by ``logfile``. Contains the time and date
of the method's call, the input arguments and time measurements for
each iteration step. A plot of ``logfile`` can be created by
``dg.DeepGraph.plot_logfile``.
Returns
-------
e : pd.DataFrame
Set the created edge table ``e`` as attribute of ``dg.DeepGraph``.
See also
--------
create_edges
Notes
-----
The parameter ``min_chunk_size`` enforces a vectorized iteration and
changing its value can both accelerate or slow down computation time.
This depends mostly on the distribution of values of the fast track
feature, and the complexity of the given ``connectors`` and
``selectors``. Use the logging capabilites to determine a good value.
When using a ``pd.HDFStore`` for the computation, the following advice
might be considered. Recall that the only requirements on the node in
the store are: the format is table(t), not fixed(t); the node is sorted
by the fast-track feature; and the fast-track feature is a data column.
The recommended procedure of storing a given node table ``v`` in a
store is the following (using the above node table):
>>> vstore = pd.HDFStore('vstore.h5')
>>> vstore.put('node_table', v, format='t', data_columns=True,
... index=False)
Setting index=False significantly decreases the time to construct the
node in the store, and also reduces the resulting file size. It has no
impact, however, on the capability of querying the store (with the
pd.HDFStore.select* methods).
However, there are two reasons one might want to create a pytables
index of the fast-track feature:
1. The node table might be too large to be sorted in memory. To sort it
on disc, one may proceed as follows. Assuming an unsorted (large) node
table
>>> v = pd.DataFrame({'time': [6.3,-3.6,4.,-1.1,1.4],
... 'x': [-3.,3.,1.,12.,7.]})
>>> v
time x
0 6.3 -3
1 -3.6 3
2 4.0 1
3 -1.1 12
4 1.4 7
one stores it as recommended
>>> vstore = pd.HDFStore('vstore.h5')
>>> vstore.put('node_table', v, format='t', data_columns=True,
... index=False)
>>> vstore.get_storer('node_table').group.table
/node_table/table (Table(5,)) ''
description := {
"index": Int64Col(shape=(), dflt=0, pos=0),
"time": Float64Col(shape=(), dflt=0.0, pos=1),
"x": Float64Col(shape=(), dflt=0.0, pos=2)}
byteorder := 'little'
chunkshape := (2730,)
creates a (full) pytables index of the fast-track feature
>>> vstore.create_table_index('node_table', columns=['time'],
... kind='full')
>>> vstore.get_storer('node_table').group.table
/node_table/table (Table(5,)) ''
description := {
"index": Int64Col(shape=(), dflt=0, pos=0),
"time": Float64Col(shape=(), dflt=0.0, pos=1),
"x": Float64Col(shape=(), dflt=0.0, pos=2)}
byteorder := 'little'
chunkshape := (2730,)
autoindex := True
colindexes := {
"time": Index(6, full, shuffle, zlib(1)).is_csi=True}
and then sorts it on disc with
>>> vstore.close()
>>> !ptrepack --chunkshape=auto --sortby=time vstore.h5 s_vstore.h5
>>> s_vstore = pd.HDFStore('s_vstore.h5')
>>> s_vstore.node_table
time x
1 -3.6 3
3 -1.1 12
4 1.4 7
2 4.0 1
0 6.3 -3
2. To speed up the internal queries on the fast-track feature
>>> s_vstore.create_table_index('node_table', columns=['time'],
... kind='full')
See
http://stackoverflow.com/questions/17893370/ptrepack-sortby-needs-full-index
and
https://gist.github.com/michaelaye/810bd0720bb1732067ff
for details, benchmarks, and the effects of compressing the store.
"""
# logging
if logfile:
_, _, _, argvalues = inspect.getargvalues(inspect.currentframe())
with open(logfile, "w") as log:
print("# LOG FILE", file=log)
print("# function call on: {}".format(datetime.now()), file=log)
print("#", file=log)
print("# Parameters", file=log)
print("# ----------", file=log)
for arg, value in argvalues.items():
print("# ", (arg, value), end="", file=log)
print("", file=log)
print("#", file=log)
print("# Iterations", file=log)
print("# ----------", file=log)
print("# max_pairs exceeded(1) | nr.of pairs | nr.of edges | " "comp.time(s)\n", file=log)
# measure performance
start_generation = datetime.now()
# v shortcut
v = self.v
# hdf key
if isinstance(v, pd.HDFStore) and hdf_key is None:
assert len(v.keys()) == 1, (
"hdf store has multiple nodes, hdf_key corresponding to the " "node table has to be passed."
)
hdf_key = self.v.keys()[0]
# datetime?
if isinstance(v, pd.HDFStore):
is_datetime = isinstance(pd.Index(v.select_column(hdf_key, ft_feature[0], stop=0)), pd.DatetimeIndex)
else:
is_datetime = isinstance(pd.Index(v.iloc[0:0][ft_feature[0]]), pd.DatetimeIndex)
# for datetime fast track features, split ft_feature
if is_datetime:
assert len(ft_feature) == 3, "for a datetime-like fast track feature, " "the unit has to specified"
dt_unit = ft_feature[-1]
ft_feature = ft_feature[:2]
else:
dt_unit = None
# create empty transfer features list if not given
if transfer_features is None:
transfer_features = []
elif not _is_array_like(transfer_features):
transfer_features = [transfer_features]
# assert that v is sorted by the fast track feature
if isinstance(v, pd.DataFrame):
assert pd.Index(
v[ft_feature[0]]
).is_monotonic_increasing, "The node table is not sorted by the fast track feature."
# initialize
coldtypedic, verboseprint = _initiate_create_edges(
verbose, v, ft_feature, connectors, selectors, r_dtype_dic, transfer_features, no_transfer_rs, hdf_key
)
# iteratively create link data frame (fast track iterator)
self.e = _ft_iterator(
self,
v,
min_chunk_size,
from_pos,
to_pos,
dt_unit,
ft_feature,
coldtypedic,
transfer_features,
max_pairs,
verboseprint,
logfile,
hdf_key,
)
# performance
deltat = datetime.now() - start_generation
verboseprint("")
verboseprint(
"computation time of function call:",
"\ts =",
int(deltat.total_seconds()),
"\tms =",
str(deltat.microseconds / 1000.0)[:6],
"\n",
)
def partition_nodes(self, features, feature_funcs=None, n_nodes=True, return_gv=False):
    """Return a supernode DataFrame ``sv``.

    Essentially a wrapper around the pandas groupby method:
    ``sv`` = ``v``.groupby(``features``).agg(``feature_funcs``). It
    induces a (intersection) partition of the nodes in ``v`` by the
    given type(s) of feature(s), one supernode per group. Passing a
    dictionary of functions on the features of ``v``,
    ``feature_funcs``, aggregates user-defined values of the
    partition's elements, the supernodes' features. If ``n_nodes`` is
    True, a column counting each supernode's constituent nodes is
    created. If ``return_gv`` is True, the underlying groupby object
    is returned as well, facilitating further operations such as
    ``gv``.apply(func, *args, **kwargs).

    For an in-depth description and mathematical details of graph
    partitioning, see https://arxiv.org/pdf/1604.00971v1.pdf, in
    particular Sec. III A, E and F.

    Parameters
    ----------
    features : str, int or array_like
        Column name(s) of ``v``, indicating the type(s) of feature(s)
        used to induce the (intersection) partition. Creates a pandas
        groupby object, ``gv`` = ``v``.groupby(``features``).
    feature_funcs : dict, optional (default=None)
        Each key must be a column name of ``v``, each value either a
        function or a list of functions, suitable for ``gv``.agg. See
        help(``gv``.agg) for details.
    n_nodes : bool, optional (default=True)
        Whether to create a ``n_nodes`` column in ``sv``, indicating
        the number of nodes in each supernode.
    return_gv : bool, optional (default=False)
        If True, also return the ``v``.groupby(``features``) object,
        ``gv``.

    Returns
    -------
    sv : pd.DataFrame
        The aggregated DataFrame of supernodes, ``sv``.
    gv : pandas.core.groupby.DataFrameGroupBy
        The pandas groupby object (only if ``return_gv`` is True).

    See also
    --------
    partition_edges
    partition_graph

    Notes
    -----
    Currently, NA groups in GroupBy are automatically excluded
    (silently). One workaround is to use a placeholder (e.g., -1,
    'none') for NA values before calling this method. See
    http://stackoverflow.com/questions/18429491/groupby-columns-with-nan-missing-values
    and https://github.com/pydata/pandas/issues/3729.
    """
    # one group per (combination of) feature value(s)
    gv = self.v.groupby(features)

    # aggregate supernode features (and group sizes, if requested)
    sv = _aggregate_super_table(funcs=feature_funcs, size=n_nodes, gt=gv)

    if n_nodes:
        # the aggregated object may not support renaming in place;
        # fall back to a copying rename in that case
        try:
            sv.rename(columns={"size": "n_nodes"}, inplace=True)
        except TypeError:
            sv = sv.rename(columns={"size": "n_nodes"})

    return (sv, gv) if return_gv else sv
def partition_edges(
    self,
    relations=None,
    source_features=None,
    target_features=None,
    relation_funcs=None,
    n_edges=True,
    return_ge=False,
):
    """Return a superedge DataFrame ``se``.

    Partition the edges in ``e`` by their types of relations, by the
    types of features of their incident source and target nodes, or by
    any combination of the three. Essentially a wrapper around the
    pandas groupby method: ``se`` = ``e``.groupby(``relations`` +
    features_s + features_t).agg(``relation_funcs``), where
    ``relations`` are column names of ``e``. In order to group ``e``
    by features_s and/or features_t, the features of type
    ``source_features`` and/or ``target_features`` (column names of
    ``v``) are transferred to ``e``, appending '_s' and/or '_t' to the
    corresponding column names of ``e`` (if they are not already
    present). The only requirement on the combination of
    ``relations``, ``source_features`` and ``target_features`` is that
    at least one of the lists has to be of length >= 1.

    Passing a dictionary of functions on the relations of ``e``,
    ``relation_funcs``, aggregates user-defined values of the
    partition's elements, the superedges' relations. If ``n_edges`` is
    True, a column counting each superedge's constituent edges is
    created. If ``return_ge`` is True, the created groupby object is
    returned as well, facilitating further operations such as
    ``ge``.apply(func, *args, **kwargs).

    For an in-depth description and mathematical details of graph
    partitioning, see https://arxiv.org/pdf/1604.00971v1.pdf, in
    particular Sec. III B, E and F.

    Parameters
    ----------
    relations : str, int or array_like, optional (default=None)
        Column name(s) of ``e``, indicating the type(s) of relation(s)
        used to induce the (intersection) partition of ``e`` (in
        conjunction with ``source_features`` and ``target_features``).
    source_features : str, int or array_like, optional (default=None)
        Column name(s) of ``v``, indicating the type(s) of feature(s)
        of the edges' incident source nodes used to induce the
        partition (in conjunction with ``relations`` and
        ``target_features``).
    target_features : str, int or array_like, optional (default=None)
        Column name(s) of ``v``, indicating the type(s) of feature(s)
        of the edges' incident target nodes used to induce the
        partition (in conjunction with ``relations`` and
        ``source_features``).
    relation_funcs : dict, optional (default=None)
        Each key must be a column name of ``e``, each value a (list
        of) function(s), suitable for ``ge``.agg. See help(``ge``.agg)
        for details.
    n_edges : bool, optional (default=True)
        Whether to create a ``n_edges`` column in ``se``, indicating
        the number of edges in each superedge.
    return_ge : bool, optional (default=False)
        If True, also return the pandas groupby object, ``ge``.

    Returns
    -------
    se : pd.DataFrame
        The aggregated DataFrame of superedges, ``se``.
    ge : pandas.core.groupby.DataFrameGroupBy
        The pandas groupby object (only if ``return_ge`` is True).

    See also
    --------
    partition_nodes
    partition_graph

    Notes
    -----
    Currently, NA groups in GroupBy are automatically excluded
    (silently). One workaround is to use a placeholder (e.g., -1,
    'none') for NA values before calling this method. See
    http://stackoverflow.com/questions/18429491/groupby-columns-with-nan-missing-values
    and https://github.com/pydata/pandas/issues/3729.

    Note that grouping by source/target features transfers the
    corresponding columns (e.g., 'color_s') to ``e`` as a side effect.
    """
    # normalize ``relations`` to a list of column names of ``e``
    if not relations:
        relations = []
    if not _is_array_like(relations):
        relations = [relations]

    # transfer requested source-node features to ``e`` (suffix '_s'),
    # skipping columns that are already present, for a fast groupby
    cols_s = []
    if source_features:
        if not _is_array_like(source_features):
            source_features = [source_features]
        for feature in source_features:
            col = feature + "_s"
            cols_s.append(col)
            if col not in self.e.columns:
                sources = self.e.index.get_level_values(0)
                self.e.loc[:, col] = self.v.loc[sources, feature].values

    # transfer requested target-node features to ``e`` (suffix '_t')
    cols_t = []
    if target_features:
        if not _is_array_like(target_features):
            target_features = [target_features]
        for feature in target_features:
            col = feature + "_t"
            cols_t.append(col)
            if col not in self.e.columns:
                targets = self.e.index.get_level_values(1)
                self.e.loc[:, col] = self.v.loc[targets, feature].values

    # group by all requested relation and feature columns, aggregate
    ge = self.e.groupby(relations + cols_s + cols_t)
    se = _aggregate_super_table(funcs=relation_funcs, size=n_edges, gt=ge)
    if n_edges:
        se = se.rename(columns={"size": "n_edges"})

    return (se, ge) if return_ge else se
def partition_graph(
    self, features, feature_funcs=None, relation_funcs=None, n_nodes=True, n_edges=True, return_gve=False
):
    """Return supergraph DataFrames ``sv`` and ``se``.

    Partition the graph represented by ``v`` and ``e`` into a
    supergraph, ``sv`` and ``se``. It creates a (intersection)
    partition of the nodes in ``v`` by the type(s) of feature(s)
    ``features``, together with the (intersection) partition's
    **corresponding** partition of the edges in ``e``.

    Essentially, this method is a wrapper around pandas groupby
    methods: ``sv`` = ``v``.groupby(``features``).agg(``feature_funcs``)
    and ``se`` = ``e``.groupby(features_s +
    features_t).agg(``relation_funcs``). In order to group ``e`` by
    features_s and features_t, the features of type ``features`` are
    transferred to ``e``, appending '_s' and '_t' to the corresponding
    column names of ``e``, indicating source and target features,
    respectively (if they are not already present).

    By passing dictionaries of functions on the features (relations)
    of ``v`` (``e``), ``feature_funcs`` (``relation_funcs``), one may
    aggregate user-defined values of the partition's elements. If
    ``n_nodes`` (``n_edges``) is True, create a column counting each
    supernode's (superedge's) constituent nodes (edges). If
    ``return_gve`` is True, also return the created groupby objects.

    For an in-depth description and mathematical details of graph
    partitioning, see https://arxiv.org/pdf/1604.00971v1.pdf, in
    particular Sec. III C, E and F.

    Parameters
    ----------
    features : str, int or array_like
        Column name(s) of ``v``, indicating the type(s) of feature(s)
        used to induce the (intersection) partition of ``v``, and its
        **corresponding** partition of the edges in ``e``.
    feature_funcs : dict, optional (default=None)
        Each key must be a column name of ``v``, each value either a
        function or a list of functions, suitable for ``gv``.agg.
    relation_funcs : dict, optional (default=None)
        Each key must be a column name of ``e``, each value either a
        function or a list of functions, suitable for ``ge``.agg.
    n_nodes : bool, optional (default=True)
        Whether to create a ``n_nodes`` column in ``sv``.
    n_edges : bool, optional (default=True)
        Whether to create a ``n_edges`` column in ``se``.
    return_gve : bool, optional (default=False)
        If True, also return the pandas groupby objects, ``gv`` and
        ``ge``.

    Returns
    -------
    sv : pd.DataFrame
        The aggregated DataFrame of supernodes, ``sv``.
    se : pd.DataFrame
        The aggregated DataFrame of superedges, ``se``.
    gv : pandas.core.groupby.DataFrameGroupBy
        The pandas groupby object, ``v``.groupby(``features``) (only
        if ``return_gve`` is True).
    ge : pandas.core.groupby.DataFrameGroupBy
        The pandas groupby object, ``e``.groupby(features_s +
        features_t) (only if ``return_gve`` is True).

    See also
    --------
    partition_nodes
    partition_edges

    Notes
    -----
    Currently, NA groups in GroupBy are automatically excluded
    (silently). One workaround is to use a placeholder (e.g., -1,
    'none') for NA values before calling this method. See
    http://stackoverflow.com/questions/18429491/groupby-columns-with-nan-missing-values
    and https://github.com/pydata/pandas/issues/3729.
    """
    # partition the node table
    gv = self.v.groupby(features)
    sv = _aggregate_super_table(funcs=feature_funcs, size=n_nodes, gt=gv)
    if n_nodes:
        # consistent with ``partition_nodes``: the aggregated object
        # may not support renaming in place, fall back to a copying
        # rename in that case
        try:
            sv.rename(columns={"size": "n_nodes"}, inplace=True)
        except TypeError:
            sv = sv.rename(columns={"size": "n_nodes"})

    # transfer feature columns to g.e, for fast groupby
    if not _is_array_like(features):
        features = [features]
    cols_s = []
    cols_t = []
    for col in features:
        cols_s.append(col + "_s")
        cols_t.append(col + "_t")
        if col + "_s" not in self.e.columns:
            s = self.e.index.get_level_values(0)
            self.e.loc[:, col + "_s"] = self.v.loc[s, col].values
        if col + "_t" not in self.e.columns:
            t = self.e.index.get_level_values(1)
            self.e.loc[:, col + "_t"] = self.v.loc[t, col].values

    # partition the edge table correspondingly
    ge = self.e.groupby(cols_s + cols_t)
    se = _aggregate_super_table(funcs=relation_funcs, size=n_edges, gt=ge)
    if n_edges:
        se = se.rename(columns={"size": "n_edges"})

    if return_gve:
        return sv, se, gv, ge
    else:
        return sv, se
[docs] def return_cs_graph(self, relations=False, dropna=True):
"""Return ``scipy.sparse.coo_matrix`` representation(s).
Create a compressed sparse graph representation for each type of
relation given by ``relations``. ``relations`` can either be False,
True, or a (list of) column name(s) of ``e``. If ``relations`` is False
(default), return a single csgraph entailing all edges in ``e.index``,
each with a weight of 1 (in that case, ``dropna`` is discarded). If
``relations`` is True, create one csgraph for each column of ``e``,
where the weights are given by the columns' values. If only a subset of
columns is to be mapped to csgraphs, ``relations`` has to be a (list
of) column name(s) of ``e``.
The argument ``dropna`` indicates whether to discard edges with NA
values or not. If ``dropna`` is True or False, it applies to all types
of relations given by ``relations``. However, ``dropna`` can also be
array_like with the same shape as ``relations`` (or with the same shape
as ``e.columns``, if ``relations`` is True).
Parameters
----------
relations : bool, str or array_like, optional (default=False)
The types of relations to be mapped to scipy csgraphs. Can be
False, True, or a (list of) column name(s) of ``e``.
dropna : bool or array_like, optional (default=True)
Whether to drop edges with NA values. If True or False, applies to
all relations given by ``relations``. Otherwise, must be the same
shape as ``relations``. If ``relations`` is False, ``dropna`` is
discarded.
Returns
-------
csgraph : scipy.sparse.coo_matrix or dict
A dictionary, where keys are column names of ``e``, and values are
the corresponding ``scipy.sparse.coo_matrix`` instance(s). If only
one csgraph is created, return it directly.
See also
--------
return_nx_graph
return_nx_multigraph
return_gt_graph
"""
from scipy.sparse import coo_matrix
# get indices
index = self.v.index
indices = index.values
n = len(indices)
# enumerate indices if necessary
if type(index) is pd.RangeIndex:
if index.start == 0 and index.stop == n:
inddic = None
else:
inddic = {j: i for i, j in enumerate(indices)}
else:
inddic = {j: i for i, j in enumerate(indices)}
# for default arguments
if relations is False:
s = self.e.index.get_level_values(0).values
t = self.e.index.get_level_values(1).values
if inddic:
s = _dic_translator(s, inddic)
t = _dic_translator(t, inddic)
else:
pass
# create cs graph
cs_g = coo_matrix((np.ones(len(s), dtype=bool), (s, t)), shape=(n, n), dtype=bool)
else:
if relations is True:
relations = self.e.columns.values
# check that relations and dropna have the same shape
if _is_array_like(relations) and _is_array_like(dropna):
assert len(relations) == len(dropna), "dropna and relations have different shapes!"
if not _is_array_like(relations):
relations = [relations]
if not _is_array_like(dropna):
dropna = [dropna] * len(relations)
# create coo_matrices
cs_g = {}
for r, drop in zip(relations, dropna):
if drop:
data = self.e[r].dropna()
else:
data = self.e[r]
s = data.index.get_level_values(0).values
t = data.index.get_level_values(1).values
if inddic:
s = _dic_translator(s, inddic)
t = _dic_translator(t, inddic)
else:
pass
# create cs graph
cs_g[r] = coo_matrix((data.values, (s, t)), shape=(n, n), dtype=data.dtype)
# if there is only one csgraph
if len(cs_g) == 1:
cs_g = cs_g[r]
return cs_g
def return_nx_graph(self, features=False, relations=False, dropna="none"):
    """Return a ``networkx.DiGraph`` representation.

    Create a ``networkx.DiGraph`` representation of the graph given by
    ``v`` and ``e``. Node and edge properties to transfer can be
    indicated by the ``features`` and ``relations`` input arguments.
    Whether to drop edges with NA values in the subset of types of
    relations given by ``relations`` can be controlled by ``dropna``.

    Needs pandas >= 0.17.0.

    Parameters
    ----------
    features : bool, str, or array_like, optional (default=False)
        Indicates which types of features to transfer as node
        attributes. Can be column name(s) of ``v``, False or True. If
        False, create no node attributes. If True, create node
        attributes for every column in ``v``. If str or array_like,
        must be column name(s) of ``v`` indicating which types of
        features to transfer.
    relations : bool, str, or array_like, optional (default=False)
        Indicates which types of relations to transfer as edge
        attributes. Can be column name(s) of ``e``, False or True. If
        False or True, all edges in ``e.index`` are transferred,
        regardless of ``dropna`` (with no attributes, or with all
        columns of ``e`` as attributes, respectively). If str or
        array_like, must be column name(s) of ``e``; which edges are
        transferred can then be controlled by ``dropna``.
    dropna : str, optional (default='none')
        One of {'none','any','all'}. If 'none', all edges in
        ``e.index`` are transferred. If 'any', drop all edges (rows)
        in ``e[relations]`` where any NA values are present. If 'all',
        drop all edges (rows) in ``e[relations]`` where all values are
        NA. Only has an effect if ``relations`` is str or array_like.

    Returns
    -------
    nx_g : networkx.DiGraph

    See also
    --------
    return_nx_multigraph
    return_cs_graph
    return_gt_graph
    """
    import networkx as nx

    nx_g = nx.DiGraph()

    # select the node-attribute columns to transfer
    if features is False:
        vt = pd.DataFrame(index=self.v.index)
    elif features is True:
        vt = self.v
    elif _is_array_like(features):
        vt = self.v[features]
    else:
        # a single column name: wrap the Series in a DataFrame
        vt = self.v[features].to_frame()

    # add nodes as (index, attribute_dict) pairs
    nx_g.add_nodes_from(vt.to_dict("index").items())

    # there may be no edge table yet (e.g., before create_edges)
    if hasattr(self, "e"):
        # select the edge-attribute columns (and rows) to transfer
        if relations is False:
            et = pd.DataFrame(index=self.e.index)
        elif relations is True:
            et = self.e
        else:
            et = self.e[relations]
            if not _is_array_like(relations):
                et = et.to_frame()
            if dropna != "none":
                et = et.dropna(how=dropna)

        # add edges as (source, target, attribute_dict) triples
        nx_g.add_edges_from((s, t, attrs) for (s, t), attrs in et.to_dict("index").items())

    return nx_g
def return_nx_multigraph(self, features=False, relations=False, dropna=True):
    """Return a ``networkx.MultiDiGraph`` representation.

    As opposed to ``return_nx_graph``, where every row of ``e`` is one
    edge, this method turns every *cell* of ``e`` into one edge of a
    ``networkx.MultiDiGraph``. ``features`` selects which node
    attributes to transfer, ``relations`` selects which cells become
    edges, and ``dropna`` controls whether cells with NA values are
    skipped.

    Needs pandas >= 0.17.0.

    Parameters
    ----------
    features : bool, str, or array_like, optional (default=False)
        Which columns of ``v`` to transfer as node attributes. False:
        none. True: all. A column name (or list of names): that subset.
    relations : bool, str, or array_like, optional (default=False)
        Which cells of ``e`` to transfer as edges. False: every cell
        becomes an edge, but no values are transferred as edge
        attributes. True: every cell becomes an edge with its value as
        attribute. A column name (or list of names): restrict the
        translation to ``e[relations]`` (values are transferred).
    dropna : bool, optional (default=True)
        If True, cells of ``e`` with NA values are not translated to
        edges.

    Returns
    -------
    nx_g : networkx.MultiDiGraph

    See also
    --------
    return_nx_graph
    return_cs_graph
    return_gt_graph
    """
    import networkx as nx

    multi_g = nx.MultiDiGraph()

    # --- nodes ---------------------------------------------------------
    if features is True:
        node_tab = self.v
    elif features is False:
        node_tab = pd.DataFrame(index=self.v.index)
    elif _is_array_like(features):
        node_tab = self.v[features]
    else:
        node_tab = self.v[features].to_frame()

    multi_g.add_nodes_from(node_tab.to_dict("index").items())

    # --- edges ---------------------------------------------------------
    if hasattr(self, "e"):
        if relations is False:
            # one bare (s, t) tuple per (non-NA) cell, no attributes
            if dropna:
                # count() gives the number of non-NA cells per row
                cell_counts = self.e.count(axis=1).to_dict()
                edge_iter = chain.from_iterable(
                    (key,) * cnt for key, cnt in cell_counts.items()
                )
            else:
                width = len(self.e.columns)
                edge_iter = chain.from_iterable(
                    (key,) * width for key in self.e.index
                )
        elif relations is True:
            edge_iter = _iter_edges(self.e, dropna)
        elif _is_array_like(relations):
            edge_iter = _iter_edges(self.e[relations], dropna)
        else:
            edge_iter = _iter_edges(self.e[relations].to_frame(), dropna)

        multi_g.add_edges_from(edge_iter)

    return multi_g
def return_gt_graph(self, features=False, relations=False, dropna="none", node_indices=False, edge_indices=False):
    """Return a ``graph_tool.Graph`` representation.

    Create a ``graph_tool.Graph`` (directed) representation of the graph
    given by ``v`` and ``e``. Node and edge properties to transfer can be
    indicated by the ``features`` and ``relations`` input arguments.
    Whether to drop edges with NA values in the subset of types of
    relations given by ``relations`` can be controlled by ``dropna``. If
    the nodes in ``v`` are not indexed by consecutive integers starting
    from 0, one may internalize the original node and edge indices as
    propertymaps by setting ``node_indices`` and/or ``edge_indices`` to
    True.

    Parameters
    ----------
    features : bool, str, or array_like, optional (default=False)
        Indicates which types of features to internalize as
        ``graph_tool.PropertyMap``. Can be column name(s) of ``v``, False
        or True. If False, create no propertymaps. If True, create
        propertymaps for every column in ``v``. If str or array_like, must
        be column name(s) of ``v`` indicating which types of features to
        internalize.
    relations : bool, str, or array_like, optional (default=False)
        Indicates which types of relations to internalize as
        ``graph_tool.PropertyMap``. Can be column name(s) of ``e``, False
        or True. If False, create no propertymaps (all edges in ``e.index``
        are transferred, regardless of ``dropna``). If True, create
        propertymaps for every column in ``e`` (all edges in ``e.index``
        are transferred, regardless of ``dropna``). If str or array_like,
        must be column name(s) of ``e`` indicating which types of relations
        to internalize (which edges are transferred can be controlled by
        ``dropna``).
    dropna : str, optional (default='none')
        One of {'none','any','all'}. If 'none', all edges in ``e.index``
        are transferred. If 'any', drop all edges (rows) in
        ``e[relations]`` where any NA values are present. If 'all', drop
        all edges (rows) in ``e[relations]`` where all values are NA. Only
        has an effect if ``relations`` is str or array_like.
    node_indices : bool, optional (default=False)
        If True, internalize a vertex propertymap ``i`` with the original
        node indices.
    edge_indices : bool, optional (default=False)
        If True, internalize edge propertymaps ``s`` and ``t`` with the
        original source and target node indices of the edges, respectively.

    Returns
    -------
    gt_g : graph_tool.Graph

    See also
    --------
    return_cs_graph
    return_nx_graph
    return_nx_multigraph

    Notes
    -----
    If the index of ``v`` is not pd.RangeIndex(start=0,stop=len(``v``),
    step=1), the indices will be enumerated, which is expensive for large
    graphs.
    """
    import graph_tool as gt
    # map numpy dtype names to graph_tool value types; any dtype not
    # listed here falls back to an 'object' propertymap below (the
    # KeyError branches)
    dtdic = {
        "bool": "bool",
        # int16_t: 'short'
        "uint8": "int16_t",
        "int8": "int16_t",
        "int16": "int16_t",
        # int32_t: 'int'
        "uint16": "int32_t",
        "int32": "int32_t",
        # int64_t: 'long'
        "uint32": "int64_t",
        "int64": "int64_t",
        "uint64": "int64_t",  # NOTE(review): uint64 values > int64 max would overflow — confirm acceptable
        # double: 'float'
        "float16": "double",
        "float32": "double",
        "float64": "double",
        "float128": "double",
    }
    # get indices
    index = self.v.index
    indices = index.values
    n = len(indices)
    # graph_tool vertices are always 0..n-1; build a translation dict
    # from original node labels to positions unless the index already
    # is RangeIndex(0, n) (enumeration is expensive for large graphs)
    if type(index) is pd.RangeIndex:
        if index.start == 0 and index.stop == n:
            inddic = None
        else:
            inddic = {j: i for i, j in enumerate(indices)}
    else:
        inddic = {j: i for i, j in enumerate(indices)}
    # create empty directed Graph
    gt_g = gt.Graph(directed=True)
    # select features (node attribute columns)
    if features is False:
        vt = pd.DataFrame(index=index)
    elif features is True:
        vt = self.v
    elif _is_array_like(features):
        vt = self.v[features]
    else:
        vt = self.v[features].to_frame()
    # add nodes
    gt_g.add_vertex(n)
    # add vertex propertymaps
    if node_indices:
        # internalize the original node labels as vertex property 'i'
        try:
            pm = gt_g.new_vertex_property(dtdic[str(index.dtype)], indices)
        except KeyError:
            # dtype not mappable to a gt value type -> generic objects
            pm = gt_g.new_vertex_property("object", indices)
        # internalize
        gt_g.vertex_properties["i"] = pm
    for col in vt.columns:
        try:
            pm = gt_g.new_vertex_property(dtdic[str(vt[col].dtype)], vt[col].values)
        except KeyError:
            pm = gt_g.new_vertex_property("object", vt[col].values)
        # internalize
        gt_g.vertex_properties[str(col)] = pm
    # select relations (edge attribute columns, optionally dropping NAs)
    if hasattr(self, "e"):
        if relations is False:
            et = pd.DataFrame(index=self.e.index)
        elif relations is True:
            et = self.e
        elif _is_array_like(relations):
            if dropna != "none":
                et = self.e[relations].dropna(how=dropna)
            else:
                et = self.e[relations]
        else:
            if dropna != "none":
                et = self.e[relations].to_frame().dropna(how=dropna)
            else:
                et = self.e[relations].to_frame()
        # add edges; the edge index is a (source, target) MultiIndex
        s = et.index.get_level_values(level=0).values
        t = et.index.get_level_values(level=1).values
        if inddic:
            # translate original labels to 0..n-1 vertex positions
            ns = _dic_translator(s, inddic).astype(int)
            nt = _dic_translator(t, inddic).astype(int)
            gt_g.add_edge_list(np.column_stack((ns, nt)))
            del ns, nt
        else:
            gt_g.add_edge_list(np.column_stack((s, t)))
        # add edge propertymaps
        if edge_indices:
            # internalize original source/target labels as 's' and 't'
            # NOTE(review): if s converts but t raises KeyError, the
            # except branch wraps the already-converted s in an 'object'
            # map — only correct when s and t share a mappable dtype;
            # verify for mixed-dtype edge indices
            try:
                s = gt_g.new_edge_property(dtdic[str(s.dtype)], s)
                t = gt_g.new_edge_property(dtdic[str(t.dtype)], t)
            except KeyError:
                s = gt_g.new_edge_property("object", s)
                t = gt_g.new_edge_property("object", t)
            # internalize
            gt_g.edge_properties["s"] = s
            gt_g.edge_properties["t"] = t
        for col in et.columns:
            try:
                pm = gt_g.new_edge_property(dtdic[str(et[col].dtype)], et[col].values)
            except KeyError:
                pm = gt_g.new_edge_property("object", et[col].values)
            # internalize
            gt_g.edge_properties[str(col)] = pm
    return gt_g
def append_cp(
    self, directed=False, connection="weak", col_name="cp", label_by_size=True, consolidate_singles=False
):
    """Append a component membership column to ``v``.

    Append a column to ``v`` indicating the component membership of each
    node. Requires scipy.

    Parameters
    ----------
    directed : bool, optional (default=False)
        If True, then operate on a directed graph: only move from point i
        to point j along paths csgraph[i, j]. If False, then find the
        shortest path on an undirected graph: the algorithm can progress
        from point i to j along csgraph[i, j] or csgraph[j, i].
    connection : str, optional (default='weak')
        One of {'weak','strong'}. For directed graphs, the type of
        connection to use. Nodes i and j are strongly connected if a path
        exists both from i to j and from j to i. Nodes i and j are weakly
        connected if only one of these paths exists. Only has an effect if
        ``directed`` is True.
    col_name : str, optional (default='cp')
        The name of the appended column of component labels.
    label_by_size : bool, optional (default=True)
        Whether to rename component membership labels to reflect component
        sizes. If True, the smallest component corresponds to the largest
        label, and the largest component corresponds to the label 0 (or 1
        if ``consolidate_singles`` is True). If False, pass on labels given
        by scipy's connected_components method directly (faster and uses
        less memory).
    consolidate_singles : bool, optional (default=False)
        If True, all singular components (components comprised of one node
        only) are consolidated under the label 0. Also, all other labels
        are renamed to reflect component sizes, see ``label_by_size``.

    Returns
    -------
    v : pd.DataFrame
        appends an extra column to ``v`` indicating component membership.
    """
    from scipy.sparse.csgraph import connected_components

    # create cs graph (scipy sparse matrix representation of ``e``)
    cs_g = self.return_cs_graph()
    # find components; [1] is the per-node label array
    labels = connected_components(cs_g, directed=directed, connection=connection)[1]
    # append cp column to v
    self.v[col_name] = labels
    # if indicated, consolidate singular components and label by size
    if consolidate_singles:
        # value_counts is sorted by size, descending
        cp_counts = self.v[col_name].value_counts()
        # f1cp = number of components with more than one member; the
        # reversed (ascending) counts let searchsorted find the first
        # count >= 2
        f1cp = len(cp_counts) - np.searchsorted(cp_counts.values[::-1], 2)
        # non-singular components get labels 1..f1cp (by size), all
        # singular components collapse to label 0
        rndic = {j: i + 1 for i, j in enumerate(cp_counts.index[:f1cp])}
        rndic.update({i: 0 for i in cp_counts.index[f1cp:]})
        # relabel cp column; Series.map is the idiomatic (and faster)
        # dict lookup compared to apply(lambda x: rndic[x])
        self.v[col_name] = self.v[col_name].map(rndic)
    # if indicated, label by size (0 = largest component)
    elif label_by_size:
        cp_counts = self.v[col_name].value_counts()
        rndic = {j: i for i, j in enumerate(cp_counts.index)}
        # relabel cp column
        self.v[col_name] = self.v[col_name].map(rndic)
def append_binning_labels_v(self, col, col_name, bins=10, log_bins=False, floor=False, return_bin_edges=False):
    """Append a column with binning labels of the values in ``v[col]``.

    Append a column ``col_name`` to ``v`` with the indices of the bins to
    which each value in ``v[col]`` belongs to.

    If ``bins`` is an int, it determines the number of bins to create. If
    ``log_bins`` is True, this number determines the (approximate) number
    of bins to create for each magnitude. For linear bins, it is the number
    of bins for the whole range of values. If ``floor`` is set True, the
    bin edges are floored to the closest integer. If ``return_bin_edges``
    is set True, the created bin edges are returned.

    If ``bins`` is a sequence, it defines the bin edges, including the
    rightmost edge, allowing for non-uniform bin widths.

    See ``np.digitize`` for details.

    Parameters
    ----------
    col : int or str
        A column name of ``v``, whose corresponding values are binned and
        labelled.
    col_name : str
        The column name for the created labels.
    bins : int or array_like, optional (default=10)
        If ``bins`` is an int, it determines the number of bins to create.
        If ``log_bins`` is True, this number determines the (approximate)
        number of bins to create for each magnitude. For linear bins, it is
        the number of bins for the whole range of values. If ``bins`` is a
        sequence, it defines the bin edges, including the rightmost edge,
        allowing for non-uniform bin widths.
    log_bins : bool, optional (default=False)
        Whether to use logarithmically or linearly spaced bins.
    floor : bool, optional (default=False)
        Whether to floor the bin edges to the closest integers.
    return_bin_edges : bool, optional (default=False)
        Whether to return the bin edges.

    Returns
    -------
    v : pd.DataFrame
        Appends an extra column ``col_name`` to ``v`` with the binning
        labels.
    bin_edges : np.ndarray
        Optionally, return the created bin edges.

    Examples
    --------
    First, we need a node table:

    >>> import pandas as pd
    >>> import deepgraph as dg
    >>> v = pd.DataFrame({'time': [1,2,12,105,899]})
    >>> g = dg.DeepGraph(v)
    >>> g.v
       time
    0     1
    1     2
    2    12
    3   105
    4   899

    Binning time values with default arguments:

    >>> bin_edges = g.append_binning_labels_v('time', 'time_l',
    ...                                       return_bin_edges=True)
    >>> bin_edges
    array([   1.        ,  100.77777778,  200.55555556,  300.33333333,
            400.11111111,  499.88888889,  599.66666667,  699.44444444,
            799.22222222,  899.        ])
    >>> g.v
       time  time_l
    0     1       1
    1     2       1
    2    12       1
    3   105       2
    4   899      10

    Binning time values with logarithmically spaced bins:

    >>> bin_edges = g.append_binning_labels_v('time', 'time_l', bins=5,
    ...                                       log_bins=True,
    ...                                       return_bin_edges=True)
    >>> bin_edges
    array([   1.        ,    1.62548451,    2.64219989,    4.29485499,
              6.98122026,   11.34786539,   18.44577941,   29.9833287 ,
             48.73743635,   79.22194781,  128.77404899,  209.32022185,
            340.24677814,  553.06586728,  899.        ])
    >>> g.v
       time  time_l
    0     1       1
    1     2       2
    2    12       6
    3   105      10
    4   899      15

    Binning time values with logarithmically spaced bins (floored):

    >>> bin_edges = g.append_binning_labels_v('time', 'time_l', bins=5,
    ...                                       log_bins=True, floor=True,
    ...                                       return_bin_edges=True)
    >>> bin_edges
    array([   1.,    2.,    4.,    6.,   11.,   18.,   29.,   48.,   79.,
            128.,  209.,  340.,  553.,  899.])
    >>> g.v
       time  time_l
    0     1       1
    1     2       2
    2    12       5
    3   105       9
    4   899      14
    """
    x = self.v[col]
    # create bins: a sequence is taken as explicit edges, an int is
    # expanded to edges by the helper (linear or logarithmic spacing)
    if _is_array_like(bins):
        bin_edges = bins
    else:
        bin_edges = _create_bin_edges(x, bins, log_bins, floor)
    # label each value with the index of the bin it falls into
    self.v[col_name] = np.digitize(x, bin_edges)
    if return_bin_edges:
        return bin_edges
def append_datetime_categories_v(self, col="time", timeofday=None, met_season=None):
"""Append datetime categories to ``v``.
Appends a "time of the day" and/or a meteorological season to ``v``,
based on a given datetime column ``col``.
Parameters
----------
col : str, optional (default='time')
A column of ``v`` comprised of datetimes.
timeofday : str, optional (default=None)
If given, the time of the day is appended as a column with the
label ``timeofday`` to ``v``. The time of the day is defined
as::
[00:06[ = 0 (night)
[06:12[ = 1 (forenoon)
[12:18[ = 2 (afternoon)
[18:24] = 3 (evening)
met_season : str, optional (default=None)
If given, the modern mid-latitude meteorological season, see
http://en.wikipedia.org/wiki/Season#Modern_mid-latitude_meteorological
is appended as a column with the label
``met_season`` to ``v``. The season is defined as:
[12:03[ = 0
[03:06[ = 1
[06:09[ = 2
[09:12[ = 3
Returns
-------
v : pd.DataFrame
appends extra column(s) to ``v`` with datetime properties.
"""
def _timeofday(datetimes):
def categorize(hour):
if hour < 6:
return 0
elif hour >= 6 and hour < 12:
return 1
elif hour >= 12 and hour < 18:
return 2
elif hour >= 18 and hour <= 24:
return 3
hour = datetimes.apply(lambda x: x.hour)
timeofday = hour.apply(categorize).values
return timeofday
def _met_season(datetimes):
def season(month):
if month >= 12 or month < 3:
return 0
elif month >= 3 and month < 6:
return 1
elif month >= 6 and month < 9:
return 2
elif month >= 9 and month < 12:
return 3
month = datetimes.apply(lambda x: x.month)
season = month.apply(season).values
return season
if timeofday:
self.v[timeofday] = _timeofday(self.v[col])
self.v[timeofday] = self.v[timeofday].astype("uint8")
if met_season:
self.v[met_season] = _met_season(self.v[col])
self.v[met_season] = self.v[met_season].astype("uint8")
[docs] def update_edges(self):
"""After removing nodes in ``v``, update ``e``.
If you deleted rows from ``v``, you can remove all edges associated
with the deleted nodes in ``e`` by calling this method.
Returns
-------
e : pd.DataFrame
update ``e``
"""
# reduce edge table
if hasattr(self, "e"):
s = self.e.index.get_level_values(0)
t = self.e.index.get_level_values(1)
self.e = self.e.loc[(s.isin(self.v.index)) & (t.isin(self.v.index))]
[docs] def filter_by_interval_v(self, col, interval, endpoint=True):
"""Keep only nodes in ``v`` with features of type ``col`` in
``interval``.
Remove all nodes from ``v`` (and their corresponding edges in ``e``)
with features of type ``col`` outside the interval given by a tuple of
values. The endpoint is included, if ``endpoint`` is not set to False.
Parameters
----------
col : str or int
A column name of ``v``, indicating the type of feature used in the
filtering.
interval : tuple
A tuple of two values, (value, larger_value). All nodes outside the
interval are removed.
endpoint : bool, optional (default=True)
False excludes the endpoint.
Returns
-------
v : pd.DataFrame
update ``v``
e : pd.DataFrame
update ``e``
"""
# reduce node table
if endpoint:
self.v = self.v[(self.v[col] >= interval[0]) & (self.v[col] <= interval[1])]
else:
self.v = self.v[(self.v[col] >= interval[0]) & (self.v[col] < interval[1])]
# reduce edge table
if hasattr(self, "e"):
self.update_edges()
[docs] def filter_by_interval_e(self, col, interval, endpoint=True):
"""Keep only edges in ``e`` with relations of type ``col`` in
``interval``.
Remove all edges from ``e`` with relations of type ``col`` outside the
interval given by a tuple of values. The endpoint is included, if
``endpoint`` is not set to False.
Parameters
----------
col : str or int
A column name of ``e``, indicating the type of relation used in the
filtering.
interval : tuple
A tuple of two values, (value, larger_value). All edges outside the
interval are removed.
endpoint : bool, optional (default=True)
False excludes the endpoint.
Returns
-------
e : pd.DataFrame
update ``e``
"""
# reduce edge table
if endpoint:
self.e = self.e[(self.e[col] >= interval[0]) & (self.e[col] <= interval[1])]
else:
self.e = self.e[(self.e[col] >= interval[0]) & (self.e[col] < interval[1])]
def filter_by_values_v(self, col, values):
    """Keep only nodes in ``v`` with features of type ``col`` in
    ``values``.

    Remove all nodes from ``v`` (and their corresponding edges in
    ``e``) whose value of ``col`` is not in ``values``.

    Parameters
    ----------
    col : str or int
        A column name of ``v``, indicating the type of feature used in
        the filtering.
    values : object or array_like
        The value(s) indicating which nodes to keep.

    Returns
    -------
    v : pd.DataFrame
        update ``v``
    e : pd.DataFrame
        update ``e``
    """
    # normalize a scalar to a one-element list for isin
    keep_values = values if _is_array_like(values) else [values]
    self.v = self.v[self.v[col].isin(keep_values)]
    # drop edges attached to removed nodes
    if hasattr(self, "e"):
        self.update_edges()
def filter_by_values_e(self, col, values):
    """Keep only edges in ``e`` with relations of type ``col`` in
    ``values``.

    Remove all edges from ``e`` whose value of ``col`` is not in
    ``values``.

    Parameters
    ----------
    col : str or int
        A column name of ``e``, indicating the type of relation used in
        the filtering.
    values : object or array_like
        The value(s) indicating which edges to keep.

    Returns
    -------
    e : pd.DataFrame
        update ``e``
    """
    # normalize a scalar to a one-element list for isin
    keep_values = values if _is_array_like(values) else [values]
    self.e = self.e[self.e[col].isin(keep_values)]
def plot_2d(
    self,
    x,
    y,
    edges=False,
    C=None,
    C_split_0=None,
    kwds_scatter=None,
    kwds_quiver=None,
    kwds_quiver_0=None,
    ax=None,
):
    """Plot nodes and corresponding edges in 2 dimensions.

    Create a scatter plot of the nodes in ``v``, and optionally a quiver
    plot of the corresponding edges in ``e``.

    The xy-coordinates of the scatter plot are determined by the values of
    ``v[x]`` and ``v[y]``, where ``x`` and ``y`` are column names of ``v``
    (the arrow's coordinates are determined automatically).

    In order to map colors to the arrows, either ``C`` or ``C_split_0``
    can be passed, an array of the same length as ``e``. Passing ``C``
    creates a single quiver plot (qu). Passing ``C_split_0`` creates two
    separate quiver plots, one for all edges where ``C_split_0`` == 0
    (qu_0), and one for all other edges (qu). By default, the arrows of
    qu_0 have no head, indicating "undirected" edges. This can be useful,
    for instance, when ``C_split_0`` represents an array of temporal
    distances.

    In order to control the plotting parameters of the scatter, quiver
    and/or quiver_0 plots, one may pass keyword arguments by setting
    ``kwds_scatter``, ``kwds_quiver`` and/or ``kwds_quiver_0``.

    Can be used iteratively by passing ``ax``.

    Parameters
    ----------
    x : int or str
        A column name of ``v``, determining the x-coordinates of the
        scatter plot of nodes.
    y : int or str
        A column name of ``v``, determining the y-coordinates of the
        scatter plot of nodes.
    edges : bool, optional (default=False)
        Whether to create a quiver plot (2-D field of arrows) of the edges
        between the nodes.
    C : array_like, optional (default=None)
        An optional array used to map colors to the arrows. Must have the
        same length as ``e``. Has no effect if ``C_split_0`` is passed as
        an argument.
    C_split_0 : array_like, optional (default=None)
        An optional array used to map colors to the arrows. Must have the
        same length as ``e``. If this parameter is passed, ``C`` has no
        effect, and two separate quiver plots are created (qu and qu_0).
    kwds_scatter : dict, optional (default=None)
        kwargs to be passed to scatter.
    kwds_quiver : dict, optional (default=None)
        kwargs to be passed to quiver (qu).
    kwds_quiver_0 : dict, optional (default=None)
        kwargs to be passed to quiver (qu_0). Only has an effect if
        ``C_split_0`` has been set.
    ax : matplotlib axes object, optional (default=None)
        An axes instance to use.

    Returns
    -------
    obj : dict
        If ``C_split_0`` has been passed, return a dict of matplotlib
        objects with the following keys: ['fig', 'ax', 'pc', 'qu', 'qu_0'].
        Otherwise, return a dict with keys: ['fig', 'ax', 'pc', 'qu'].

    Notes
    -----
    When passing ``C_split_0``, the color of the arrows in qu_0 can be set
    by passing the keyword argument `color` to ``kwds_quiver_0``. The color
    of the arrows in qu, however, are determined by ``C_split_0``.

    The default drawing order is set to:

    1. quiver_0 (zorder=1)
    2. quiver (zorder=2)
    3. scatter (zorder=3)

    This order can be changed by setting the ``zorder`` in
    ``kwds_quiver_0``, ``kwds_quiver`` and/or ``kwds_scatter``.
    See also http://matplotlib.org/examples/pylab_examples/zorder_demo.html

    See also
    --------
    plot_2d_generator
    plot_3d
    plot_map
    plot_map_generator
    """
    # delegate to the shared 2d backend; is_map=False means plain
    # cartesian axes (no basemap), hence kwds_basemap/m are None
    return self._plot_2d(
        is_map=False,
        x=x,
        y=y,
        edges=edges,
        C=C,
        C_split_0=C_split_0,
        kwds_scatter=kwds_scatter,
        kwds_quiver=kwds_quiver,
        kwds_quiver_0=kwds_quiver_0,
        kwds_basemap=None,
        ax=ax,
        m=None,
    )
def plot_2d_generator(
    self,
    x,
    y,
    by,
    edges=False,
    C=None,
    C_split_0=None,
    kwds_scatter=None,
    kwds_quiver=None,
    kwds_quiver_0=None,
    passable_ax=False,
):
    """Plot nodes and corresponding edges by groups.

    Create a generator of scatter plots of the nodes in ``v``, split in
    groups by ``v``.groupby(``by``). If edges is set True, also create a
    quiver plot of each group's corresponding edges.

    The xy-coordinates of the scatter plots are determined by the values of
    ``v[x]`` and ``v[y]``, where ``x`` and ``y`` are column names of ``v``
    (the arrow's coordinates are determined automatically).

    In order to map colors to the arrows, either ``C`` or ``C_split_0``
    can be passed, an array of the same length as ``e``. Passing ``C``
    creates a single quiver plot (qu). Passing ``C_split_0`` creates two
    separate quiver plots, one for all edges where ``C_split_0`` == 0
    (qu_0), and one for all other edges (qu). By default, the arrows of
    qu_0 have no head, indicating "undirected" edges. This can be useful,
    for instance, when ``C_split_0`` represents an array of temporal
    distances.

    When mapping colors to arrows by setting ``C`` (or ``C_split_0``),
    `clim` is automatically set to the min and max values of the entire
    array. In case one wants clim to be set to min and max values for each
    group's colors, one may explicitly pass `clim` = None to
    ``kwds_quiver``.

    The same behaviour occurs when passing a sequence of ``g.n`` Numbers as
    colors `c` to ``kwds_scatter``. In that case, `vmin` and `vmax` are
    automatically set to `c`.min() and `c`.max() of all nodes. Explicitly
    setting `vmin` and `vmax` to `None`, the min and max values of the
    groups' color arrays are used.

    In order to control the plotting parameters of the scatter, quiver
    and/or quiver_0 plots, one may pass keyword arguments by setting
    ``kwds_scatter``, ``kwds_quiver`` and/or ``kwds_quiver_0``.

    If ``passable_ax`` is True, create a generator of functions. Each
    function takes a matplotlib axes object as input, and returns a
    scatter/quiver plot.

    Parameters
    ----------
    x : int or str
        A column name of ``v``, determining the x-coordinates of the
        scatter plot of nodes.
    y : int or str
        A column name of ``v``, determining the y-coordinates of the
        scatter plot of nodes.
    by : array_like
        Column name(s) of ``v``, determining the groups to create plots of.
    edges : bool, optional (default=False)
        Whether to create a quiver plot (2-D field of arrows) of the edges
        between the nodes.
    C : array_like, optional (default=None)
        An optional array used to map colors to the arrows. Must have the
        same length as ``e``. Has no effect if ``C_split_0`` is passed as
        an argument.
    C_split_0 : array_like, optional (default=None)
        An optional array used to map colors to the arrows. Must have the
        same length as ``e``. If this parameter is passed, ``C`` has no
        effect, and two separate quiver plots are created (qu and qu_0).
    kwds_scatter : dict, optional (default=None)
        kwargs to be passed to scatter.
    kwds_quiver : dict, optional (default=None)
        kwargs to be passed to quiver (qu).
    kwds_quiver_0 : dict, optional (default=None)
        kwargs to be passed to quiver (qu_0). Only has an effect if
        ``C_split_0`` has been set.
    passable_ax : bool, optional (default=False)
        If True, return a generator of functions. Each function takes a
        matplotlib axes object as input, and returns a dict of matplotlib
        objects.

    Returns
    -------
    obj : generator
        If ``C_split_0`` has been passed, return a generator of dicts of
        matplotlib objects with the following keys: ['fig', 'ax', 'pc',
        'qu', 'qu_0', 'group']. Otherwise, return a generator of dicts
        with keys: ['fig', 'ax', 'pc', 'qu', 'group'].
        If ``passable_ax`` is True, return a generator of functions. Each
        function takes a matplotlib axes object as input, and returns a
        dict as described above.

    Notes
    -----
    When passing ``C_split_0``, the color of the arrows in qu_0 can be set
    by passing the keyword argument `color` to ``kwds_quiver_0``. The color
    of the arrows in qu, however, are determined by ``C_split_0``.

    The default drawing order is set to:

    1. quiver_0 (zorder=1)
    2. quiver (zorder=2)
    3. scatter (zorder=3)

    This order can be changed by setting the ``zorder`` in
    ``kwds_quiver_0``, ``kwds_quiver`` and/or ``kwds_scatter``.
    See also http://matplotlib.org/examples/pylab_examples/zorder_demo.html

    See also
    --------
    append_binning_labels_v
    plot_2d
    plot_3d
    plot_map
    plot_map_generator
    """
    # delegate to the shared grouped-2d backend; is_map=False means
    # plain cartesian axes (no basemap), hence kwds_basemap is None
    return self._plot_2d_generator(
        is_map=False,
        x=x,
        y=y,
        by=by,
        edges=edges,
        C=C,
        C_split_0=C_split_0,
        kwds_basemap=None,
        kwds_scatter=kwds_scatter,
        kwds_quiver=kwds_quiver,
        kwds_quiver_0=kwds_quiver_0,
        passable_ax=passable_ax,
    )
def plot_map(
    self,
    lon,
    lat,
    edges=False,
    C=None,
    C_split_0=None,
    kwds_basemap=None,
    kwds_scatter=None,
    kwds_quiver=None,
    kwds_quiver_0=None,
    ax=None,
    m=None,
):
    """Plot nodes and corresponding edges on a basemap.

    Create a scatter plot of the nodes in ``v`` and optionally a quiver
    plot of the corresponding edges in ``e`` on a
    ``mpl_toolkits.basemap.Basemap`` instance.

    The coordinates of the scatter plot are determined by the node's
    longitudes and latitudes (in degrees): ``v[lon]`` and ``v[lat]``, where
    ``lon`` and ``lat`` are column names of ``v`` (the arrow's coordinates
    are determined automatically).

    In order to map colors to the arrows, either ``C`` or ``C_split_0``
    can be passed, an array of the same length as ``e``. Passing ``C``
    creates a single quiver plot (qu). Passing ``C_split_0`` creates two
    separate quiver plots, one for all edges where ``C_split_0`` == 0
    (qu_0), and one for all other edges (qu). By default, the arrows of
    qu_0 have no head, indicating "undirected" edges. This can be useful,
    for instance, when ``C_split_0`` represents an array of temporal
    distances.

    In order to control the parameters of the basemap, scatter, quiver
    and/or quiver_0 plots, one may pass keyword arguments by setting
    ``kwds_basemap``, ``kwds_scatter``, ``kwds_quiver`` and/or
    ``kwds_quiver_0``.

    Can be used iteratively by passing ``ax`` and/or ``m``.

    Parameters
    ----------
    lon : int or str
        A column name of ``v``. The corresponding values must be longitudes
        in degrees.
    lat : int or str
        A column name of ``v``. The corresponding values must be latitudes
        in degrees.
    edges : bool, optional (default=False)
        Whether to create a quiver plot (2-D field of arrows) of the edges
        between the nodes.
    C : array_like, optional (default=None)
        An optional array used to map colors to the arrows. Must have the
        same length as ``e``. Has no effect if ``C_split_0`` is passed as
        an argument.
    C_split_0 : array_like, optional (default=None)
        An optional array used to map colors to the arrows. Must have the
        same length as ``e``. If this parameter is passed, ``C`` has no
        effect, and two separate quiver plots are created (qu and qu_0).
    kwds_basemap : dict, optional (default=None)
        kwargs passed to basemap.
    kwds_scatter : dict, optional (default=None)
        kwargs to be passed to scatter.
    kwds_quiver : dict, optional (default=None)
        kwargs to be passed to quiver (qu).
    kwds_quiver_0 : dict, optional (default=None)
        kwargs to be passed to quiver (qu_0). Only has an effect if
        ``C_split_0`` has been set.
    ax : matplotlib axes object, optional (default=None)
        An axes instance to use.
    m : Basemap object, optional (default=None)
        A mpl_toolkits.basemap.Basemap instance to use.

    Returns
    -------
    obj : dict
        If ``C_split_0`` has been passed, return a dict of matplotlib
        objects with the following keys: ['fig', 'ax', 'm', 'pc', 'qu',
        'qu_0']. Otherwise, return a dict with keys: ['fig', 'ax', 'm',
        'pc', 'qu'].

    Notes
    -----
    When passing ``C_split_0``, the color of the arrows in qu_0 can be set
    by passing the keyword argument `color` to ``kwds_quiver_0``. The color
    of the arrows in qu, however, are determined by ``C_split_0``.

    The default drawing order is set to:

    1. quiver_0 (zorder=1)
    2. quiver (zorder=2)
    3. scatter (zorder=3)

    This order can be changed by setting the ``zorder`` in
    ``kwds_quiver_0``, ``kwds_quiver`` and/or ``kwds_scatter``.
    See also http://matplotlib.org/examples/pylab_examples/zorder_demo.html

    See also
    --------
    plot_map_generator
    plot_2d
    plot_2d_generator
    plot_3d
    """
    # delegate to the shared 2d backend; is_map=True draws on a
    # Basemap, with lon/lat taking the role of x/y
    return self._plot_2d(
        is_map=True,
        x=lon,
        y=lat,
        edges=edges,
        C=C,
        C_split_0=C_split_0,
        kwds_basemap=kwds_basemap,
        kwds_scatter=kwds_scatter,
        kwds_quiver=kwds_quiver,
        kwds_quiver_0=kwds_quiver_0,
        ax=ax,
        m=m,
    )
[docs] def plot_map_generator(
self,
lon,
lat,
by,
edges=False,
C=None,
C_split_0=None,
kwds_basemap=None,
kwds_scatter=None,
kwds_quiver=None,
kwds_quiver_0=None,
passable_ax=False,
):
"""Plot nodes and corresponding edges by groups, on basemaps.
Create a generator of scatter plots of the nodes in ``v``, split in
groups by ``v``.groupby(``by``), on a ``mpl_toolkits.basemap.Basemap``
instance. If edges is set True, also create a quiver plot of each
group's corresponding edges.
The coordinates of the scatter plots are determined by the node's
longitudes and latitudes (in degrees): ``v[lon]`` and ``v[lat]``, where
``lon`` and ``lat`` are column names of ``v`` (the arrow's coordinates
are determined automatically).
In order to map colors to the arrows, either ``C`` or ``C_split_0``
can be be passed, an array of the same length as ``e``. Passing ``C``
creates a single quiver plot (qu). Passing ``C_split_0`` creates two
separate quiver plots, one for all edges where ``C_split_0`` == 0
(qu_0), and one for all other edges (qu). By default, the arrows of
qu_0 have no head, indicating "undirected" edges. This can be useful,
for instance, when ``C_split_0`` represents an array of temporal
distances.
When mapping colors to arrows by setting ``C`` (or ``C_split_0``),
`clim` is automatically set to the min and max values of the entire
array. In case one wants clim to be set to min and max values for each
group's colors, one may explicitly pass `clim` = None to
``kwds_quiver``.
The same behaviour occurs when passing a sequence of ``g.n`` Numbers as
colors `c` to ``kwds_scatter``. In that case, `vmin` and `vmax` are
automatically set to `c`.min() and `c`.max() of all nodes. Explicitly
setting `vmin` and `vmax` to `None`, the min and max values of the
groups' color arrays are used.
In order to control the parameters of the basemap, scatter, quiver
and/or quiver_0 plots, one may pass keyword arguments by setting
``kwds_basemap``, ``kwds_scatter``, ``kwds_quiver`` and/or
``kwds_quiver_0``.
If ``passable_ax`` is True, create a generator of functions. Each
function takes a matplotlib axes object (and/or a Basemap object) as
input, and returns a scatter/quiver plot.
Parameters
----------
lon : int or str
A column name of ``v``. The corresponding values must be longitudes
in degrees.
lat : int or str
A column name of ``v``. The corresponding values must be latitudes
in degrees.
by : array_like
Column name(s) of ``v``, determining the groups to create plots of.
edges : bool, optional (default=True)
Whether to create a quiver plot (2-D field of arrows) of the edges
between the nodes.
C : array_like, optional (default=None)
An optional array used to map colors to the arrows. Must have the
same length es ``e``. Has no effect if ``C_split_0`` is passed as
an argument.
C_split_0 : array_like, optional (default=None)
An optional array used to map colors to the arrows. Must have the
same length es ``e``. If this parameter is passed, ``C`` has no
effect, and two separate quiver plots are created (qu and qu_0).
kwds_basemap : dict, optional (default=None)
kwargs passed to basemap.
kwds_scatter : dict, optional (default=None)
kwargs to be passed to scatter.
kwds_quiver : dict, optional (default=None)
kwargs to be passed to quiver (qu).
kwds_quiver_0 : dict, optional (default=None)
kwargs to be passed to quiver (qu_0). Only has an effect if
``C_split_0`` has been set.
passable_ax : bool, optional (default=False)
If True, return a generator of functions. Each function takes a
matplotlib axes object (and/or a Basemap object) as input, and
returns a dict of matplotlib objects.
Returns
-------
obj : generator
If ``C_split_0`` has been passed, return a generator of dicts of
matplotlib objects with the following keys: ['fig', 'ax', 'm',
'pc', 'qu', 'qu_0', 'group']. Otherwise, return a generator of
dicts with keys: ['fig', 'ax', 'm', 'pc', 'qu', 'group'].
If ``passable_ax`` is True, return a generator of functions. Each
function takes a matplotlib axes object (and/or a Basemap object)
as input, and returns a dict as described above.
Notes
-----
When passing ``C_split_0``, the color of the arrows in qu_0 can be set
by passing the keyword argument `color` to ``kwds_quiver_0``. The color
of the arrows in qu, however, are determined by ``C_split_0``.
The default drawing order is set to:
1. quiver_0 (zorder=1)
2. quiver (zorder=2)
3. scatter (zorder=3)
This order can be changed by setting the ``zorder`` in
``kwds_quiver_0``, ``kwds_quiver`` and/or ``kwds_scatter``.
See also http://matplotlib.org/examples/pylab_examples/zorder_demo.html
See also
--------
append_binning_labels_v
plot_map
plot_2d
plot_2d_generator
plot_3d
"""
return self._plot_2d_generator(
is_map=True,
x=lon,
y=lat,
by=by,
edges=edges,
C=C,
C_split_0=C_split_0,
kwds_basemap=kwds_basemap,
kwds_scatter=kwds_scatter,
kwds_quiver=kwds_quiver,
kwds_quiver_0=kwds_quiver_0,
passable_ax=passable_ax,
)
def plot_3d(self, x, y, z, edges=False, kwds_scatter=None, kwds_quiver=None, ax=None):
    """Work in progress!
    Create a 3d scatter plot of the nodes in ``v``, and optionally a 3d
    quiver plot of the edges in ``e``.
    Parameters
    ----------
    x : int or str
        A column name of ``v``, determining the x coordinates.
    y : int or str
        A column name of ``v``, determining the y coordinates.
    z : int or str
        A column name of ``v``, determining the z coordinates.
    edges : bool, optional (default=False)
        Whether to draw the edges as 3d arrows between the nodes.
    kwds_scatter : dict, optional (default=None)
        kwargs to be passed to scatter.
    kwds_quiver : dict, optional (default=None)
        kwargs to be passed to quiver.
    ax : matplotlib axes object, optional (default=None)
        An axes instance to use. Must be a 3d projection axes.
    Returns
    -------
    obj : dict of matplotlib objects
        Keys are ['pc'] and, if ``edges`` is True, also ['qu'].
    Notes
    -----
    experimental, quiver3D scaling?
    See also
    --------
    plot_2d
    plot_2d_generator
    plot_map
    plot_map_generator
    """
    # set kwds
    if kwds_scatter is None:
        kwds_scatter = {}
    if kwds_quiver is None:
        kwds_quiver = {}
    # importing Axes3D registers the '3d' projection
    from mpl_toolkits.mplot3d.axes3d import Axes3D  # @UnusedImport
    # return dict of matplotlib objects
    obj = {}
    # create figure and axes
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection="3d")
    else:
        fig = ax.get_figure()
    # create PathCollection by scatter, on the given axes
    # (BUGFIX: was plt.scatter, which draws on the *current* axes and
    # ignores a user-passed ``ax``)
    x, y, z = (self.v[x], self.v[y], self.v[z])
    pc = ax.scatter(x, y, zs=z, zdir="z", **kwds_scatter)
    obj["pc"] = pc
    # draw edges as arrows
    if edges is True:
        # get unique indices of edged nodes
        s = self.e.index.get_level_values(level=0).values
        t = self.e.index.get_level_values(level=1).values
        # xyz position of sources and targets
        xs, ys, zs = (x.loc[s].values, y.loc[s].values, z.loc[s].values)
        xt, yt, zt = (x.loc[t].values, y.loc[t].values, z.loc[t].values)
        # upcast dtypes
        xs = np.array(xs, dtype=float)
        ys = np.array(ys, dtype=float)
        zs = np.array(zs, dtype=float)
        xt = np.array(xt, dtype=float)
        yt = np.array(yt, dtype=float)
        # BUGFIX: was np.array(zs, ...) -- the targets' z coordinates
        # were overwritten by the sources', collapsing every arrow's
        # z-extent to zero
        zt = np.array(zt, dtype=float)
        # vector components of the arrows
        dx = xt - xs
        dy = yt - ys
        dz = zt - zs
        qu = ax.quiver(xs, ys, zs, dx, dy, dz, **kwds_quiver)
        obj["qu"] = qu
    return obj
def plot_rects_label_numeric(self, label, xl, xr, colors=None, ax=None, **kwargs):
    """Work in progress!
    Plot one rectangle per node, with one row of rectangles per label.
    Each node's rectangle spans from ``v[xl]`` to ``v[xr]`` horizontally
    and has a height of 1.2, centered on its label's row.
    NOTE(review): the values of ``v[label]`` are used directly as y
    positions and as the y-tick range (``np.arange(v[label].max() + 1)``),
    so they appear to be expected to be non-negative integers -- confirm
    with callers.
    Parameters
    ----------
    label : int or str
        A column name of ``v``, containing the categorical variable
        (labels) that determines each rectangle's row.
    xl : int or str
        A column name of ``v``, containing the left x values of the
        rectangles.
    xr : int or str
        A column name of ``v``, containing the right x values of the
        rectangles.
    colors : array_like, optional (default=None)
        Values used to map colors to the rectangles, one per node.
    ax : matplotlib axes object, optional (default=None)
        An axes instance to use.
    kwargs : keywords
        kwargs to pass to matplotlib.collections.PolyCollection
    Returns
    -------
    obj : dict of matplotlib objects
        Keys are ['fig', 'ax', 'c']
    See also
    --------
    plot_rects_numeric_numeric
    """
    from matplotlib.collections import PolyCollection
    v = self.v[[label, xl, xr]]
    # return dict of matplotlib objects
    obj = {}
    # create figure and axes
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()
    obj["fig"] = fig
    obj["ax"] = ax
    # include colors in dataframe for sorting (constant 1 if not given)
    if colors is not None:
        v["color"] = colors
    else:
        v["color"] = 1
    # rectangle coordinates
    xl, xr = (v[xl].values, v[xr].values)
    widths = xr - xl
    # rows centered on the label values, fixed height of 1.2
    yb = v[label] - 0.6
    heights = np.ones(len(xr)) * 1.2
    # corner points of each rectangle (counter-clockwise)
    recs = []
    for x, y, width, height in zip(xl, yb, widths, heights):
        recs.append(((x, y), (x, y + height), (x + width, y + height), (x + width, y)))
    # create poly collection of rectangles
    c = PolyCollection(recs, **kwargs)
    # set colors
    c.set_array(v["color"])
    obj["c"] = c
    # add PolyCollection
    ax.add_collection(c)
    # set yticklabels
    positions = np.arange(v[label].max() + 1)
    ax.set_yticks(positions)
    # set x/y lims with a 5% margin around the rectangles
    dx = 0.05 * (xr.max() - xl.min())
    dy = 0.05 * (yb.max() + 1.2 - yb.min())
    ax.set_xlim((xl.min() - dx, xr.max() + dx))
    ax.set_ylim((yb.min() - dy, yb.max() + 1.2 + dy))
    return obj
def plot_rects_numeric_numeric(self, yb, yt, xl, xr, colors=None, ax=None, **kwargs):
    """Work in progress!
    Plot one rectangle per node, with arbitrary numeric extents.
    Each node's rectangle spans from ``v[xl]`` to ``v[xr]`` horizontally
    and from ``v[yb]`` to ``v[yt]`` vertically.
    Parameters
    ----------
    yb : int or str
        A column name of ``v``, containing the bottom y values of the
        rectangles.
    yt : int or str
        A column name of ``v``, containing the top y values of the
        rectangles.
    xl : int or str
        A column name of ``v``, containing the left x values of the
        rectangles.
    xr : int or str
        A column name of ``v``, containing the right x values of the
        rectangles.
    colors : array_like, optional (default=None)
        Values used to map colors to the rectangles, one per node.
    ax : matplotlib axes object, optional (default=None)
        An axes instance to use.
    kwargs : keywords
        kwargs to pass to matplotlib.collections.PolyCollection
    Returns
    -------
    obj : dict of matplotlib objects
        Keys are ['fig', 'ax', 'c']
    See also
    --------
    plot_rects_label_numeric
    """
    from matplotlib.collections import PolyCollection
    v = self.v[[yb, yt, xl, xr]]
    # return dict of matplotlib objects
    obj = {}
    # create figure and axes
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()
    obj["fig"] = fig
    obj["ax"] = ax
    # include colors in dataframe for sorting (constant 1 if not given)
    if colors is not None:
        v["color"] = colors
    else:
        v["color"] = 1
    # rectangle coordinates
    xl, xr = (v[xl].values, v[xr].values)
    widths = xr - xl
    yb, yt = (v[yb], v[yt])
    heights = yt - yb
    # corner points of each rectangle (counter-clockwise)
    recs = []
    for x, y, width, height in zip(xl, yb, widths, heights):
        recs.append(((x, y), (x, y + height), (x + width, y + height), (x + width, y)))
    # create poly collection of rectangles
    c = PolyCollection(recs, **kwargs)
    # set colors
    c.set_array(v["color"])
    obj["c"] = c
    # add PolyCollection
    ax.add_collection(c)
    # set x/y lims with a 5% margin around the rectangles
    dx = 0.05 * (xr.max() - xl.min())
    dy = 0.05 * (yt.max() - yb.min())
    ax.set_xlim((xl.min() - dx, xr.max() + dx))
    ax.set_ylim((yb.min() - dy, yt.max() + dy))
    return obj
def plot_raster(self, label, time="time", ax=None, **kwargs):
    """Work in progress!
    Create a raster plot of the nodes' times, one row per label.
    Parameters
    ----------
    label : int or str
        A column name of ``v``. The corresponding values group the
        nodes into rows of the raster plot.
    time : int or str, optional (default='time')
        A column name of ``v``. The corresponding values determine the
        positions of the vertical lines in each row.
    ax : matplotlib axes object, optional (default=None)
        An axes instance to use.
    kwargs : keywords
        kwargs to pass to matplotlib.pyplot.vlines
    Returns
    -------
    obj : dict of matplotlib objects
        Keys are ['fig', 'ax', 'vlines']
    """
    # return dict of matplotlib objects
    obj = {}
    # create figure and axes
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()
    obj["fig"] = fig
    obj["ax"] = ax
    # sort by labels
    v = self.v[[label, time]].sort_values(label)
    # unique labels, in sorted order
    labels = v[label].unique()
    # create raster plot, one row of vlines per label, rows centered
    # at y = 1 .. len(labels)
    vlines = []
    for i, l in enumerate(labels):
        vlines.append(ax.vlines(v[v[label] == l][time].values, i + 0.5, i + 1.5, **kwargs))
    obj["vlines"] = vlines
    # set labels as yticklabels
    positions = np.arange(1, len(labels) + 1)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    # set x/y lims with a 5% margin
    dx = 0.05 * (v[time].max() - v[time].min())
    dy = 0.05 * (positions.max() - positions.min())
    ax.set_xlim((v[time].min() - dx, v[time].max() + dx))
    ax.set_ylim((positions.min() - dy, positions.max() + dy))
    # set x/y label
    ax.set_xlabel("time")
    ax.set_ylabel(label)
    return obj
@staticmethod
def plot_hist(x, bins=10, log_bins=False, density=False, floor=False, ax=None, **kwargs):
    """Plot a histogram (or pdf) of x.
    Compute and plot the histogram (or probability density) of x. Keyword
    arguments are passed to plt.plot. See parameters and ``np.histogram``
    for details.
    Parameters
    ----------
    x : array_like
        The data from which a frequency distribution is plot.
    bins : int or array_like, optional (default=10)
        If ``bins`` is an int, it determines the number of bins to create.
        If ``log_bins`` is True, this number determines the (approximate)
        number of bins to create for each magnitude. For linear bins, it is
        the number of bins for the whole range of values. If ``bins`` is a
        sequence, it defines the bin edges, including the rightmost edge,
        allowing for non-uniform bin widths.
    log_bins : bool, optional (default=False)
        Whether to use logarithmically or linearly spaced bins.
    density : bool, optional (default=False)
        If False, the result will contain the number of samples in each
        bin. If True, the result is the value of the probability *density*
        function at the bin, normalized such that the *integral* over the
        range is 1. Note that the sum of the histogram values will not be
        equal to 1 unless bins of unity width are chosen; it is not a
        probability *mass* function.
    floor : bool, optional (default=False)
        Whether to floor the bin edges to the closest integers. Only has an
        effect if ``bins`` is an int.
    ax : matplotlib axes object, optional (default=None)
        An axes instance to use.
    Returns
    -------
    ax : matplotlib axes object
        A matplotlib axes instance.
    hist : np.ndarray
        The values of the histogram. See ``density``.
    bin_edges : np.ndarray
        The edges of the bins.
    """
    # create bins
    if _is_array_like(bins):
        # BUGFIX: cast to ndarray so the bin-center arithmetic below is
        # element-wise (a plain list would concatenate on '+' and then
        # raise on '/'); also makes the returned bin_edges match the
        # documented np.ndarray contract
        bin_edges = np.asarray(bins)
    else:
        bin_edges = _create_bin_edges(x, bins, log_bins, floor)
    # counts and bin_centers; empty bins are masked with NaN so they
    # leave gaps in the line plot instead of drawing zeros
    hist, _ = np.histogram(x, bin_edges, density=density)
    hist = hist.astype(float)
    hist[hist == 0] = np.nan
    bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2.0
    # plot
    if ax is None:
        _, ax = plt.subplots()
    ax.plot(bin_centers, hist, **kwargs)
    # set scales
    if log_bins:
        ax.set_xscale("log")
    return ax, hist, bin_edges
@staticmethod
def plot_logfile(logfile):
    """Plot a logfile.
    Plot a benchmark logfile created by ``create_edges`` or
    ``create_edges_ft``.
    Parameters
    ----------
    logfile : str
        The filename of the logfile.
    Returns
    -------
    obj : dict
        Depending on the logfile, return a dict of matplotlib objects with
        a subset of the following keys: ['fig', 'ax', 'pc_n', 'pc_e',
        'cb_n', 'cb_e']
    """
    # load data from log file (rows = iterations)
    logfile = np.loadtxt(logfile)
    # return dict of matplotlib objects
    obj = {}
    # logfile columns:
    #    0            1             2            3
    # exceeded | nr.of pairs | nr.of edges | comp.time
    # partition by non-/exceeded max_pairs
    log_n = logfile[logfile[:, 0] == 0]
    log_e = logfile[logfile[:, 0] == 1]
    fig, ax = plt.subplots()
    obj["fig"] = fig
    obj["ax"] = ax
    # scatter normal iterations; color encodes log10 of the edge count
    # (+1 guards against log10(0) for iterations without edges)
    pc_n = ax.scatter(
        log_n[:, 1], log_n[:, 3], s=20, c=np.log10(log_n[:, 2] + 1), marker="o", label="normal", edgecolors="none"
    )
    obj["pc_n"] = pc_n
    # scatter max_pair exceeded iterations
    pc_e = ax.scatter(
        log_e[:, 1],
        log_e[:, 3],
        s=30,
        c=np.log10(log_e[:, 2] + 1),
        cmap="gist_earth",
        marker="D",
        label="max_pairs exceeded",
    )
    obj["pc_e"] = pc_e
    msg = "iterations: {:d} | total time: {:.2f}s | total edges: {:d}"
    ax.set_title(msg.format(len(logfile), logfile[:, 3].sum(), int(logfile[:, 2].sum())))
    ax.set_xlabel("nr.of pairs")
    ax.set_ylabel("comp.time (s)")
    ax.set_xscale("log")
    ax.legend(loc=2)
    ax.grid()
    # add colorbar(s) only for the scatter plot(s) that have data,
    # otherwise colorbar() would fail on an empty mappable
    if len(log_e) == 0:
        cb_n = fig.colorbar(pc_n, fraction=0.03)
        cb_n.set_label("log10(n_edges) (normal)")
        fig.tight_layout()
        obj["cb_n"] = cb_n
    elif len(log_n) == 0:
        cb_e = fig.colorbar(pc_e, fraction=0.03)
        cb_e.set_label("log10(n_edges) (exceeded)")
        fig.tight_layout()
        obj["cb_e"] = cb_e
    else:
        cb_e = fig.colorbar(pc_e, fraction=0.03)
        cb_n = fig.colorbar(pc_n, fraction=0.03)
        cb_n.set_label("log10(n_edges) (normal)")
        cb_e.set_label("log10(n_edges) (exceeded)")
        fig.tight_layout()
        obj["cb_n"] = cb_n
        obj["cb_e"] = cb_e
    return obj
@property
def n(self):
    """The number of nodes"""
    # no node table set yet
    if not hasattr(self, "v"):
        return 0
    # in-memory node table: simply its length
    if not isinstance(self.v, pd.HDFStore):
        return len(self.v)
    # HDFStore-backed node table: only well-defined for a single key
    keys = self.v.keys()
    if len(keys) == 1:
        return self.v.get_storer(keys[0]).nrows
    return "NA"
@property
def m(self):
    """The number of edges"""
    # 0 if no edge table has been set yet
    return len(self.e) if hasattr(self, "e") else 0
@property
def f(self):
    """Types of features and number of features of corresponding type."""
    if not hasattr(self, "v"):
        return "there are no nodes"
    # counting is not supported for an HDFStore-backed node table
    return "NA" if isinstance(self.v, pd.HDFStore) else self.v.count()
@property
def r(self):
    """Types of relations and number of relations of corresponding type."""
    if not hasattr(self, "e"):
        return "there are no edges"
    # per-column non-NA counts of the edge table
    return self.e.count()
def _plot_2d(
    self, is_map, x, y, edges, C, C_split_0, kwds_scatter, kwds_quiver, kwds_quiver_0, kwds_basemap, ax, m
):
    """Backend for ``plot_2d`` and ``plot_map``.

    Scatter-plot the nodes at (``v[x]``, ``v[y]``) -- on a Basemap if
    ``is_map`` is True -- and, if ``edges`` is True, draw the edges as
    arrows (quiver). Returns a dict of the created matplotlib objects.
    """
    if is_map:
        from mpl_toolkits.basemap import Basemap
    # set kwds; copies, since defaults are popped from them below
    if kwds_basemap is None:
        kwds_basemap = {}
    else:
        kwds_basemap = kwds_basemap.copy()
    if kwds_scatter is None:
        kwds_scatter = {}
    else:
        kwds_scatter = kwds_scatter.copy()
    if kwds_quiver is None:
        kwds_quiver = {}
    else:
        kwds_quiver = kwds_quiver.copy()
    if kwds_quiver_0 is None:
        kwds_quiver_0 = {}
    else:
        kwds_quiver_0 = kwds_quiver_0.copy()
    # set draw order; defaults: quiver_0 below quiver below scatter
    try:
        zorder_qu0 = kwds_quiver_0.pop("zorder")
    except KeyError:
        zorder_qu0 = 1
    try:
        zorder_qu = kwds_quiver.pop("zorder")
    except KeyError:
        zorder_qu = 2
    try:
        zorder_pc = kwds_scatter.pop("zorder")
    except KeyError:
        zorder_pc = 3
    # create dict for matplotlib objects
    obj = {}
    # create figure, axes (and basemap)
    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()
    obj["fig"] = fig
    obj["ax"] = ax
    if is_map and m is None:
        m = Basemap(ax=ax, **kwds_basemap)
        obj["m"] = m
    elif is_map and m is not None:
        obj["m"] = m
    # create PathCollection by scatter; keep the column names around,
    # they are needed again for the edge positions below
    x_str = x
    y_str = y
    x, y = (self.v[x_str].values, self.v[y_str].values)
    if is_map:
        axm = m
        # project lon/lat to map coordinates
        x, y = m(x, y)
        # bug in basemap, it changed dtypes
        x = np.array(x, dtype=float)
        y = np.array(y, dtype=float)
    else:
        axm = ax
    pc = axm.scatter(x, y, zorder=zorder_pc, **kwds_scatter)
    obj["pc"] = pc
    # draw edges as arrows
    if edges is True:
        # source- and target-indices
        s = self.e.index.get_level_values(level=0).values
        t = self.e.index.get_level_values(level=1).values
        # latlon position of sources and targets, vector components
        x, y = (self.v[x_str], self.v[y_str])
        if is_map:
            xs, ys = m(x.loc[s].values, y.loc[s].values)
            xt, yt = m(x.loc[t].values, y.loc[t].values)
        else:
            xs, ys = (x.loc[s].values, y.loc[s].values)
            xt, yt = (x.loc[t].values, y.loc[t].values)
        # upcast dtypes
        xs = np.array(xs, dtype=float)
        ys = np.array(ys, dtype=float)
        xt = np.array(xt, dtype=float)
        yt = np.array(yt, dtype=float)
        dx = xt - xs
        dy = yt - ys
        # bug in basemap, changed dtypes
        if is_map:
            dx = np.array(dx, dtype=float)
            dy = np.array(dy, dtype=float)
        # create quiver plot
        if C_split_0 is not None:
            # arrows with C_split_0 == 0 are drawn as a separate quiver
            # (qu_0): single color, headless by default ("undirected")
            try:
                color = kwds_quiver_0.pop("color")
            except KeyError:
                color = "k"
            try:
                headwidth = kwds_quiver_0.pop("headwidth")
            except KeyError:
                headwidth = 1
            C = C_split_0
            qu_0 = axm.quiver(
                xs[C == 0],
                ys[C == 0],
                dx[C == 0],
                dy[C == 0],
                color=color,
                angles="xy",
                scale_units="xy",
                scale=1,
                headwidth=headwidth,
                zorder=zorder_qu0,
                **kwds_quiver_0,
            )
            qu = axm.quiver(
                xs[C != 0],
                ys[C != 0],
                dx[C != 0],
                dy[C != 0],
                C[C != 0],
                angles="xy",
                scale_units="xy",
                scale=1,
                zorder=zorder_qu,
                **kwds_quiver,
            )
            obj["qu_0"] = qu_0
            obj["qu"] = qu
        elif C is not None:
            # single quiver plot, arrow colors mapped from C
            qu = axm.quiver(
                xs, ys, dx, dy, C, angles="xy", scale_units="xy", scale=1, zorder=zorder_qu, **kwds_quiver
            )
            obj["qu"] = qu
        else:
            qu = axm.quiver(xs, ys, dx, dy, angles="xy", scale_units="xy", scale=1, zorder=zorder_qu, **kwds_quiver)
            obj["qu"] = qu
    return obj
def _plot_2d_generator(
    self, is_map, x, y, by, edges, C, C_split_0, kwds_basemap, kwds_scatter, kwds_quiver, kwds_quiver_0, passable_ax
):
    """Backend for ``plot_2d_generator`` and ``plot_map_generator``.

    Group the node table by ``by`` and yield, for every group, either a
    dict of matplotlib objects (``passable_ax`` False) or a function
    taking an axes (and/or Basemap) that creates and returns that dict
    (``passable_ax`` True).

    Bug fixes with respect to the previous revision:
    1. The yielded plot functions bind their group as default arguments,
       so collecting all functions first and calling them afterwards no
       longer draws every plot with the last group (late-binding closure
       pitfall).
    2. When ``C`` was passed (without ``C_split_0``) and ``edges`` was
       True, the plot function raised an UnboundLocalError: it rebound
       the name ``C`` locally before testing ``C is not None``. The
       group's color array is now read into a separate local name.
    """
    if is_map:
        from mpl_toolkits.basemap import Basemap
    # set kwargs; copies, since defaults are popped from them below
    kwds_basemap = {} if kwds_basemap is None else kwds_basemap.copy()
    kwds_scatter = {} if kwds_scatter is None else kwds_scatter.copy()
    kwds_quiver = {} if kwds_quiver is None else kwds_quiver.copy()
    kwds_quiver_0 = {} if kwds_quiver_0 is None else kwds_quiver_0.copy()
    # draw order defaults: quiver_0 below quiver below scatter
    zorder_qu0 = kwds_quiver_0.pop("zorder", 1)
    zorder_qu = kwds_quiver.pop("zorder", 2)
    zorder_pc = kwds_scatter.pop("zorder", 3)
    # assert there's no color given in quiver kwds
    assert "color" not in kwds_quiver.keys(), "use 'C' or 'C_split_0' for setting the color of quiver!"
    # select v
    v = self.v[_flatten([x, y, by])]
    # sentinel to distinguish "key not given" from "key given as None"
    _missing = object()
    # set xlim/ylim for non map plots, shared by all groups
    if not is_map:
        dx = (v[x].max() - v[x].min()) * 0.05
        dy = (v[y].max() - v[y].min()) * 0.05
        xlim = (v[x].min() - dx, v[x].max() + dx)
        ylim = (v[y].min() - dy, v[y].max() + dy)
    # store (possibly array_like) scatter size/color in the dataframe,
    # so they are split into groups together with the nodes
    v["pc_s"] = kwds_scatter.pop("s", 20)
    pc_c = kwds_scatter.pop("c", _missing)
    if pc_c is _missing:
        pc_c = None
        v["pc_c"] = 1
    else:
        v["pc_c"] = pc_c
    # scatter vmin/vmax -> min/max of the entire color array, unless
    # explicitly passed (possibly as None)
    pc_vmin = kwds_scatter.pop("vmin", _missing)
    if pc_vmin is _missing:
        try:
            pc_vmin = pc_c.min()
        except AttributeError:
            pc_vmin = None
    pc_vmax = kwds_scatter.pop("vmax", _missing)
    if pc_vmax is _missing:
        try:
            pc_vmax = pc_c.max()
        except AttributeError:
            pc_vmax = None
    # quiver colors, and quiver clim -> entire min/max
    if edges is True:
        if C_split_0 is not None:
            e = pd.DataFrame(data={"Cqu0": C_split_0}, index=self.e.index)
            qu_clim = kwds_quiver.pop("clim", _missing)
            if qu_clim is _missing:
                qu_clim = [C_split_0.min(), C_split_0.max()]
        elif C is not None:
            e = pd.DataFrame(data={"C": C}, index=self.e.index)
            qu_clim = kwds_quiver.pop("clim", _missing)
            if qu_clim is _missing:
                qu_clim = [C.min(), C.max()]
        else:
            e = pd.DataFrame(index=self.e.index)
            qu_clim = None
        # defaults for quiver_0 at [C_split_0 == 0]: black, headless
        qu_0_color = kwds_quiver_0.pop("color", "k")
        qu_0_headwidth = kwds_quiver_0.pop("headwidth", 1)
    else:
        e = None
    # generator loop
    x_str = x
    y_str = y
    gv = v.groupby(by)
    for labels, group in gv:
        # _labels/_group are bound as default arguments so that each
        # yielded function keeps its own group
        def obj(ax=None, m=None, _labels=labels, _group=group):
            """Plot nodes and corresponding edges.
            See ``plot_2d_generator`` or ``plot_map_generator`` for
            details.
            Parameters
            ----------
            ax : matplotlib axes object, optional (default=None)
                An axes instance to use.
            m : Basemap object, optional (default=None)
                A mpl_toolkits.basemap.Basemap instance to use.
            Returns
            -------
            obj : dict
                Return a dict of matplotlib objects.
            """
            # store group labels in obj
            obj = {"group": _labels}
            # filter edges by group
            g = DeepGraph(_group, e)
            g.update_edges()
            # create figure, axes (and basemap)
            if ax is None:
                fig, ax = plt.subplots()
            else:
                fig = ax.get_figure()
            obj["fig"] = fig
            obj["ax"] = ax
            if is_map and m is None:
                m = Basemap(ax=ax, **kwds_basemap)
                obj["m"] = m
            elif is_map and m is not None:
                obj["m"] = m
            else:
                ax.set_xlim(xlim)
                ax.set_ylim(ylim)
            # create PathCollection by scatter
            x, y = (g.v[x_str].values, g.v[y_str].values)
            if is_map:
                axm = m
                x, y = m(x, y)
            else:
                axm = ax
            # need to change colors to list, in case they're not numbers
            pc = axm.scatter(
                x,
                y,
                c=g.v.pc_c.values.tolist(),
                s=g.v.pc_s.values,
                vmin=pc_vmin,
                vmax=pc_vmax,
                zorder=zorder_pc,
                **kwds_scatter,
            )
            obj["pc"] = pc
            # draw edges as arrows
            if edges is True:
                # source- and target-indices
                s = g.e.index.get_level_values(level=0).values
                t = g.e.index.get_level_values(level=1).values
                # xy position of sources and targets, vector components
                x, y = (g.v[x_str], g.v[y_str])
                if is_map:
                    xs, ys = m(x.loc[s].values, y.loc[s].values)
                    xt, yt = m(x.loc[t].values, y.loc[t].values)
                else:
                    xs, ys = (x.loc[s].values, y.loc[s].values)
                    xt, yt = (x.loc[t].values, y.loc[t].values)
                # upcast dtypes
                xs = np.array(xs, dtype=float)
                ys = np.array(ys, dtype=float)
                xt = np.array(xt, dtype=float)
                yt = np.array(yt, dtype=float)
                dx = xt - xs
                dy = yt - ys
                # bug in basemap, changes dtypes
                if is_map:
                    dx = np.array(dx, dtype=float)
                    dy = np.array(dy, dtype=float)
                if C_split_0 is not None:
                    # NOTE: read into a local name; rebinding ``C`` here
                    # would make it local to this function and break the
                    # ``elif C is not None`` branch below
                    c_arr = g.e.Cqu0.values
                    qu_0 = axm.quiver(
                        xs[c_arr == 0],
                        ys[c_arr == 0],
                        dx[c_arr == 0],
                        dy[c_arr == 0],
                        color=qu_0_color,
                        angles="xy",
                        scale_units="xy",
                        scale=1,
                        headwidth=qu_0_headwidth,
                        zorder=zorder_qu0,
                        **kwds_quiver_0,
                    )
                    qu = axm.quiver(
                        xs[c_arr != 0],
                        ys[c_arr != 0],
                        dx[c_arr != 0],
                        dy[c_arr != 0],
                        c_arr[c_arr != 0],
                        angles="xy",
                        scale_units="xy",
                        scale=1,
                        clim=qu_clim,
                        zorder=zorder_qu,
                        **kwds_quiver,
                    )
                    obj["qu_0"] = qu_0
                    obj["qu"] = qu
                elif C is not None:
                    c_arr = g.e.C.values
                    qu = axm.quiver(
                        xs,
                        ys,
                        dx,
                        dy,
                        c_arr,
                        angles="xy",
                        scale_units="xy",
                        scale=1,
                        clim=qu_clim,
                        zorder=zorder_qu,
                        **kwds_quiver,
                    )
                    obj["qu"] = qu
                else:
                    qu = axm.quiver(
                        xs, ys, dx, dy, angles="xy", scale_units="xy", scale=1, zorder=zorder_qu, **kwds_quiver
                    )
                    obj["qu"] = qu
            return obj

        if passable_ax:
            yield obj
        else:
            yield obj()