Source code for deepgraph.deepgraph

"""The core module of DeepGraph (dg).

This module contains the core class ``dg.DeepGraph`` providing the means
to construct, manipulate and partition graphs, and offering interfacing
methods to common network representations and popular Python network
packages. This class also provides plotting methods to visualize graphs
and their properties and to benchmark the graph construction parameters.

For further information type

>>> help(dg.DeepGraph)

"""
import inspect

# Copyright (C) 2017-2023 by
# Dominik Traxl <dominik.traxl@posteo.org>
# All rights reserved.
# BSD license.


import os
from datetime import datetime
from itertools import chain

from deepgraph.iterators_and_indexers import (
    _matrix_iterator,
    _ft_iterator,
    _iter_edges,
    _initiate_create_edges,
    _aggregate_super_table,
)
from deepgraph.utils import _is_array_like, _dic_translator, _create_bin_edges, _flatten

try:
    import matplotlib as mpl

    display = "DISPLAY" in os.environ
    if not display:
        mpl.use("Agg")
    import matplotlib.pyplot as plt
except ImportError:
    mpl = None
    plt = None

import numpy as np
import pandas as pd

# get rid of false positive SettingWithCopyWarnings, see
# http://stackoverflow.com/questions/20625582/how-to-deal-with-this-pandas-warning
pd.options.mode.chained_assignment = None


[docs]class DeepGraph:
    """The core class of DeepGraph (dg).

    This class encapsulates the graph representation as ``pandas.DataFrame``
    objects in its attributes ``v`` and ``e``. It can be initialized with a
    node table ``v``, whose rows represent the nodes of the graph, as well
    as an edge table ``e``, whose rows represent edges between the nodes.

    Given a node table ``v``, it provides methods to iteratively compute
    pairwise relations between the nodes using arbitrary, user-defined
    functions. These methods provide arguments to parallelize the
    computation and control memory consumption (see ``create_edges`` and
    ``create_edges_ft``).

    Also provides methods to partition nodes, edges or an entire graph by
    the graph's properties and labels, and to create common network
    representations and graph objects of popular Python network packages.

    Furthermore, it provides methods to visualize graphs and their
    properties and to benchmark the graph construction parameters.

    Optionally, the convenience parameter ``supernode_labels_by`` can be
    passed, creating supernode labels by enumerating all distinct (tuples
    of) values of a (multiple) column(s) of ``v`` . Superedge labels can be
    created analogously, by passing the parameter ``superedge_labels_by``.

    Parameters
    ----------
    v : pandas.DataFrame or pandas.HDFStore, optional (default=None)
        The node table, a table representation of the nodes of a graph. The
        index of ``v`` must be unique and represents the node indices. The
        column names of ``v`` represent the types of features of the nodes,
        and each cell represents a feature of a node. Only a reference to
        the input DataFrame is created, not a copy. May also be a
        ``pandas.HDFStore``, but only ``create_edges`` and
        ``create_edges_ft`` may then be used (so far).

    e : pandas.DataFrame, optional (default=None)
        The edge table, a table representation of the edges between the
        nodes given by ``v``. Its index has to be a
        ``pandas.core.index.MultiIndex``, whose first level contains the
        indices of the source nodes, and the second level contains the
        indices of the target nodes. Each row of ``e`` represents an edge,
        column names of ``e`` represent the types of relations of the edges,
        and each cell in ``e`` represents a relation of an edge. Only a
        reference to the input DataFrame is created, not a copy.

    supernode_labels_by : dict, optional (default=None)
        A dictionary whose keys are strings and their values are (lists of)
        column names of ``v``. Appends a column to ``v`` for each key, whose
        values correspond to supernode labels, enumerating all distinct
        (tuples of) values of the column(s) given by the dict's value.

    superedge_labels_by : dict, optional (default=None)
        A dictionary whose keys are strings and their values are (lists of)
        column names of ``e``. Appends a column to ``e`` for each key, whose
        values correspond to superedge labels enumerating all distinct
        (tuples of) values of the column(s) given by the dict's value.

    Attributes
    ----------
    v : pandas.DataFrame
        See Parameters.

    e : pandas.DataFrame
        See Parameters.

    n : int
        Property: Number of nodes.

    m : int
        Property: Number of edges.

    f : pd.DataFrame
        Property: types of features and number of features of corresponding
        type.

    r : pd.DataFrame
        Property: types of relations and number of relations of
        corresponding type.

    """

[docs]    def __init__(self, v=None, e=None, supernode_labels_by=None, superedge_labels_by=None):
        # create supernode labels by common features
        if supernode_labels_by is not None:
            for key, value in supernode_labels_by.items():
                v[key] = v.groupby(value).grouper.group_info[0]

        # create superedge labels by common relations
        if superedge_labels_by is not None:
            for key, value in superedge_labels_by.items():
                e[key] = e.groupby(value).grouper.group_info[0]

        # assert v input, set as class attribute
        if v is not None:
            # assert (isinstance(v, pd.DataFrame) or
            #         isinstance(v, pd.HDFStore)), (
            #     "v has to be <type 'pd.DataFrame'> "
            #     "or <type 'pd.HDFStore'>, not {}".format(type(v)))
            self.v = v

        # assert e input, set as class attribute
        if e is not None:
            # assert isinstance(e, pd.DataFrame), (
            #     "e has to be <type 'pd.DataFrame'>, not {}".format(type(e)))
            self.e = e

    def __repr__(self):
        msg = "<{} object, with n={} node(s) and m={} edge(s) at 0x{:02x}>"
        return msg.format(type(self).__name__, self.n, self.m, id(self))

    def __str__(self):
        msg = "<{} object, with n={} node(s) and m={} edge(s) at 0x{:02x}>"
        return msg.format(type(self).__name__, self.n, self.m, id(self))

[docs]    def create_edges(
        self,
        connectors=None,
        selectors=None,
        transfer_features=None,
        r_dtype_dic=None,
        no_transfer_rs=None,
        step_size=int(1e7),
        from_pos=0,
        to_pos=None,
        hdf_key=None,
        verbose=False,
        logfile=None,
    ):
        """Create an edge table ``e`` linking the nodes in ``v``.

        This method enables an iterative computation of pairwise relations
        (edges) between the nodes represented by ``v``. It does so in a
        flexible, efficient and vectorized fashion, easily parallelizable and
        with full control over RAM usage.

        1. Connectors

        The simplest use-case is to define a single connector function
        acting on a single column of the node table ``v``. For instance, given
        a node table ``v``

        >>> import pandas as pd
        >>> import deepgraph as dg
        >>> v = pd.DataFrame({'time': [0.,2.,9.], 'x': [3.,1.,12.]})
        >>> g = dg.DeepGraph(v)

        >>> g.v
           time   x
        0     0   3
        1     2   1
        2     9  12

        one may define a function

        >>> def time_difference(time_s, time_t):
        ...     dt = time_t - time_s
        ...     return dt

        and pass it to ``create_edges``, in order to compute the time
        difference of each pair of nodes

        >>> g.create_edges(connectors=time_difference)

        >>> g.e
             dt
        s t
        0 1   2
          2   9
        1 2   7

        As one can see, the connector function takes column names of ``v`` with
        additional '_s' and '_t' endings (indicating source node values and
        target node values, respectively) as input, and returns a variable with
        the computed values. The resulting edge table ``g.e`` is indexed by the
        node indices ('s' and 't', representing source and target node indices,
        respectively), and has one column ('dt', the name of the returned
        variable) with the computed values of the given connector. Note that
        only the upper triangle adjacency matrix is computed, which is always
        the case. See Notes for further information.

        One may also pass a list of functions to ``connectors``, which are then
        computed in the list's order. Generally, a connector function can take
        multiple column names of ``v`` (with '_s' and/or '_t' appended) as
        input, as well as already computed relations of former connectors.
        Also, any connector function may have multiple output variables. Every
        output variable has to be a 1-dimensional ``np.ndarray`` (with
        arbitrary dtype, including ``object``). The return statement may not
        contain any operators, only references to each computed relation.

        For instance, considering the above example, one may define an
        additional connector

        >>> def velocity(dt, x_s, x_t):
        ...     dx = x_t - x_s
        ...     v = dx / dt
        ...     return v, dx

        and then apply both connectors on ``v``, resulting in

        >>> g.create_edges(connectors=[time_difference, velocity])

        >>> g.e
             dt  dx         v
        s t
        0 1   2  -2 -1.000000
          2   9   9  1.000000
        1 2   7  11  1.571429

        2. Selectors

        However, one is often only interested in a subset of all possible
        edges. In order to select edges during the iteration process - based on
        some conditions on the node's features and their computed relations -
        one may pass a (list of) selector function(s) to ``create_edges``. For
        instance, given the above example, one may define a selector

        >>> def dt_thresh(dt, sources, targets):
        ...     sources = sources[dt > 5]
        ...     targets = targets[dt > 5]
        ...     return sources, targets

        and apply it in conjunction with the ``time_difference`` connector

        >>> g.create_edges(connectors=time_difference, selectors=dt_thresh)

        >>> g.e
             dt
        s t
        0 2   9
        1 2   7

        leaving only edges with a time difference larger than 5.

        Every selector function must have ``sources`` and ``targets`` as input
        arguments as well as in the return statement. Most generally, they may
        depend on column names of ``v`` (with '_s' and/or '_t' appended) and/or
        computed relations of connector functions, and/or computed relations of
        former selector functions. Apart from ``sources`` and ``targets``, they
        may additionally return computed relations. Given this input/output
        flexibility of selectors, one could in fact compute all required
        relations, and select any desired subset of edges, with a single
        selector function. The purpose of splitting connectors and/or
        selectors, however, is to control the iteration's performance by
        consecutively computing relations and selecting edges: **hierarchical
        selection**.

        3. Hierarchical Selection

        As the algorithm iterates through the chunks of all possible source and
        target node indices ([0, g.n*(g.n-1)/2]), it goes through the list of
        ``selectors`` at each step. If a selector has a relation as input, it
        must have either been computed by a former selector, or the selector
        requests its computation by the corresponding connector function in
        ``connectors`` (this connector may not depend on any other not yet
        computed relations). Once the input relations are computed (if
        requested), the selector is applied and returns updated indices, which
        are then passed to the next selector. Hence, with each selector, the
        indices are reduced and consecutive computation of relations only
        consider the remaining indices. After all selectors have been applied,
        the connector functions that have not been requested by any selector
        are computed (on the final, reduced chunk of node and target indices).

        4. Transferring Features

        The argument ``transfer_features``, which takes a (list of) column
        name(s) of ``v``, makes it possible to transfer features of ``v`` to
        the created edge table ``e``

        >>> g.create_edges(connectors=time_difference,
        ...                transfer_features=['x', 'time'])

        >>> g.e
             dt  time_s  time_t  x_s  x_t
        s t
        0 1   2       0       2    3    1
          2   9       0       9    3   12
        1 2   7       2       9    1   12

        If computation time and memory consumption are of no concern, one might
        skip the remaing paragraphs.

        5. Logging

        Clearly, the order of the hierarchical selection as described in 3.
        influences the computation's efficiency. The complexity of a relation's
        computation and the (expected average) number of deleted edges of a
        selector should be considered primarily. In order to track and
        benchmark the iteration process, the progress and time measurements are
        printed for each iteration step, if ``verbose`` is set to True.
        Furthermore, one may create a logfile (which can also be plot by
        ``dg.DeepGraph.plot_logfile``) by setting the argument ``logfile`` to a
        string, indicating the file name of the created logfile.

        6. Parallelization and Memory Control

        The arguments ``from_pos``, ``to_pos`` and ``step_size`` control the
        range of processed pairs of nodes and the number of pairs of nodes to
        process at each iteration step. They may be used for parallel
        computation and to control RAM usage. See Parameters for details.

        It is also possible to initiate ``dg.DeepGraph`` with a
        ``pandas.HDFStore`` containing the DataFrame representing the node
        table. Only the data requested by ``transfer_features`` and the user-
        defined ``connectors`` and ``selectors`` at each iteration step is then
        pulled from the store, which is particularly useful for large node
        tables and parallel computation. The only requirement is that the node
        table contained in the store is in table(t) format, not fixed(f)
        format. For instance, considering the above created node table, one may
        store it in a hdf file

        >>> vstore = pd.HDFStore('vstore.h5')
        >>> vstore.put('node_table', v, format='t', index=False)

        initiate a DeepGraph instance with the store

        >>> g = dg.DeepGraph(vstore)

        >>> g.v
        <class 'pandas.io.pytables.HDFStore'>
        File path: vstore.h5
        /node_table            frame_table  (typ->appendable,nrows->3,ncols->2,
        indexers->[index])

        and then create edges the same way as if ``g.v`` were a DataFrame

        >>> g.create_edges(connectors=time_difference)

        >>> g.e
             dt
        s t
        0 1   2
          2   9
        1 2   7

        In case the store has multiple nodes, ``hdf_key`` has to be set to the
        node corresponding to the node table of the graph.

        Also, one may pass a (list of) name(s) of computed relations,
        ``no_transfer_rs``, which should not be transferred to the created edge
        table ``e``. This can be advantageous, for instance, if a selector
        depends on computed relations that are of no further interest.

        Furthermore, it is possible to force the dtype of computed relations
        with the argument ``r_dtype_dic``. The dtype of a relation is then set
        at each iteration step, but **after** all selectors and connectors were
        processed.

        7. Creating Edges on a Fast Track

        If the selection of edges includes a simple distance threshold, i.e. a
        selector function defined as follows:

        >>> def ft_selector(x_s, x_t, threshold, sources, targets):
        ...     dx = x_t - x_s
        ...     sources = sources[dx <= threshold]
        ...     targets = targets[dx <= threshold]
        ...     return sources, targets, dx

        the method ``create_edges_ft`` should be considered, since it provides
        a much faster iteration algorithm.

        Parameters
        ----------
        connectors : function or array_like, optional (default=None)
            User defined connector function(s) that compute pairwise relations
            between the nodes in ``v``. A connector accepts multiple column
            names of ``v`` (with '_s' and/or '_t' appended, indicating source
            node values and target node values, respectively) as input, as well
            as already computed relations of former connectors. A connector
            function may have multiple output variables. Every output variable
            has to be a 1-dimensional ``np.ndarray`` (with arbitrary dtype,
            including ``object``). See above and ``dg.functions`` for examplary
            connector functions.

        selectors : function or array_like, optional (default=None)
            User defined selector function(s) that select edges during the
            iteration process, based on some conditions on the node's features
            and their computed relations. Every selector function must have
            ``sources`` and ``targets`` as input arguments as well as in the
            return statement. A selector may depend on column names of ``v``
            (with '_s' and/or '_t' appended) and/or computed relations of
            connector functions, and/or computed relations of former selector
            functions. Apart from ``sources`` and ``targets``, they may also
            return computed relations (see connectors). See above, and
            ``dg.functions`` for exemplary selector functions.

        transfer_features : str, int or array_like, optional (default=None)
            A (list of) column name(s) of ``v``, indicating which features of
            ``v`` to transfer to ``e`` (appending '_s' and '_t' to the column
            names of ``e``, indicating source and target node features,
            respectively).

        r_dtype_dic : dict, optional (default=None)
            A dictionary with names of computed relations of connectors and/or
            selectors as keys and dtypes as values. Forces the data types of
            the computed relations in ``e`` during the iteration (but **after**
            all selectors and connectors were processed), otherwise infers
            them.

        no_transfer_rs : str or array_like, optional (default=None)
            Name(s) of computed relations that are not to be transferred to the
            created edge table ``e``. Can be used to save memory, e.g., if a
            selector depends on computed relations that are of no interest
            otherwise.

        step_size : int, optional (default=1e6)
            The number of pairs of nodes to process at each iteration step.
            Must be in [ 1, g.n*(g.n-1)/2 ]. Its value determines computation
            speed and memory consumption.

        from_pos : int, optional (default=0)
            Determines from which pair of nodes to start the iteration process.
            Must be in [ 0, g.n*(g.n-1)/2 [. May be used in conjuction with
            ``to_pos`` for parallel computation.

        to_pos : positive integer, optional (default=None)
            Determines at which pair of nodes to stop the iteration process
            (the endpoint is excluded). Must be in [ 1, g.n*(g.n-1)/2 ] and
            larger than ``from_pos``. Defaults to None, which translates to the
            last pair of nodes, g.n*(g.n-1)/2. May be used in conjunction with
            ``from_pos`` for parallel computation.

        hdf_key : str, optional (default=None)
            If you initialized ``dg.DeepGraph`` with a ``pandas.HDFStore`` and
            the store has multiple nodes, you must pass the key to the node in
            the store that corresponds to the node table.

        verbose : bool, optional (default=False)
            Whether to print information at each step of the iteration process.

        logfile : str, optional (default=None)
            Create a log-file named by ``logfile``. Contains the time and date
            of the method's call, the input arguments and time mesaurements for
            each iteration step. A plot of ``logfile`` can be created by
            ``dg.DeepGraph.plot_logfile``.

        Returns
        -------
        e : pd.DataFrame
            Set the created edge table ``e`` as attribute of ``dg.DeepGraph``.

        See also
        --------
        create_edges_ft

        Notes
        -----
        1. Input and output data types

        Since connectors (and selectors) take columns of a pandas DataFrame as
        input, there are no restrictions on the data types of which pairwise
        relations are computed. In the most general case, a DataFrame's column
        has ``object`` as dtype, and its values may then be arbitrary Python
        objects. The same goes for the output variables of connectors (and
        selectors). The only requirement is that each ouput variable is
        1-dimensional.

        However, it is also possible to use the values of a column of ``v`` as
        references to arbitrary objects, which may sometimes be more
        convenient. In case a connector (or selector) needs the node's original
        indices as input, one may simply copy them to a column, e.g.

        >>> v['indices'] = v.index

        and then define the connector's (or selector's) input arguments
        accordingly.

        2. Connectors and selectors

        The only requirement on connectors and selectors is that their input
        arguments and return statements are consistent with the column names of
        ``v`` and the passing of computed relations (see above, 3. Hierarchical
        Selection).

        Whatever happens inside the functions is entirely up to the user. This
        means, for instance, that one may wrap arbitrary functions within a
        connector (selector), such as optimized C functions or existing
        functions whose input/output is not consistent with the
        ``create_edges`` method (see, e.g., the methods provided in
        ``dg.functions``, ``scipy`` or scikit learn's ``sklearn.metrics`` and
        ``sklearn.neighbors.DistanceMetric``). One could also store a
        connector's (selector's) computations directly within the function, or
        let the function print out any desired information during iteration.

        3. Why not compute the full adjacency matrix?

        This is due to efficiency. For any asymmetric function (i.e., f(s, t)
        != f(t, s)), one can always create an additional connector (or output
        variable) that computes the mirrored values of that function.

        """

        # logging
        if logfile:
            _, _, _, argvalues = inspect.getargvalues(inspect.currentframe())
            with open(logfile, "w") as log:
                print("# LOG FILE", file=log)
                print("# function call on: {}".format(datetime.now()), file=log)
                print("#", file=log)
                print("# Parameters", file=log)
                print("# ----------", file=log)
                for arg, value in argvalues.items():
                    print("# ", (arg, value), end="", file=log)
                    print("", file=log)
                print("#", file=log)
                print("# Iterations", file=log)
                print("# ----------", file=log)
                print("# max_pairs exceeded(1) | nr.of pairs | nr.of edges | " "comp.time(s)\n", file=log)

        # measure performance
        start_generation = datetime.now()

        # v shortcut
        v = self.v

        # adjust keywords
        min_chunk_size = step_size
        ft_feature = None

        # create empty transfer features list if not given
        if transfer_features is None:
            transfer_features = []
        elif not _is_array_like(transfer_features):
            transfer_features = [transfer_features]

        # hdf_key
        if isinstance(v, pd.HDFStore) and hdf_key is None:
            assert len(v.keys()) == 1, (
                "hdf store has multiple nodes, hdf_key corresponding to the " " node table has to be passed."
            )
            hdf_key = self.v.keys()[0]

        # initialize
        coldtypedic, verboseprint = _initiate_create_edges(
            verbose, v, ft_feature, connectors, selectors, r_dtype_dic, transfer_features, no_transfer_rs, hdf_key
        )

        # iteratively create link data frame (matrix iterator)
        self.e = _matrix_iterator(
            v, min_chunk_size, from_pos, to_pos, coldtypedic, transfer_features, verboseprint, logfile, hdf_key
        )

        # performance
        deltat = datetime.now() - start_generation
        verboseprint("")
        verboseprint(
            "computation time of function call:",
            "\ts =",
            int(deltat.total_seconds()),
            "\tms =",
            str(deltat.microseconds / 1000.0)[:6],
            "\n",
        )

[docs]    def create_edges_ft(
        self,
        ft_feature,
        connectors=None,
        selectors=None,
        transfer_features=None,
        r_dtype_dic=None,
        no_transfer_rs=None,
        min_chunk_size=1000,
        max_pairs=int(1e7),
        from_pos=0,
        to_pos=None,
        hdf_key=None,
        verbose=False,
        logfile=None,
    ):
        """Create (ft) an edge table ``e`` linking the nodes in ``v``.

        This method implements the same functionalities as ``create_edges``,
        with the difference of providing a much quicker iteration algorithm
        based on a so-called fast-track feature. It is advised to read the
        docstring of ``create_edges`` before this one, since only the
        differences are explained in the following.

        Apart from the hierarchical selection through ``connectors`` and
        ``selectors`` as described in the method ``create_edges`` (see 1.-3.),
        this method necessarily includes the (internal) selector function

        >>> def ft_selector(ftf_s, ftf_t, ftt, sources, targets):
        ...     ft_r = ftf_t - ftf_s
        ...     sources = sources[ft_r <= ftt]
        ...     targets = targets[ft_r <= ftt]
        ...     return sources, targets, ft_r

        where ``ftf`` is the fast-track feature (a column name of ``v``),
        ``ftt`` the fast-track threshold (a positive number), and ft_r the
        computed fast-track relation. The argument ``ft_feature``, which has
        to be a tuple (``ftf``, ``ftt``), determines these variables.

        1. The Fast-Track Feature

        The simplest use-case, therefore, is to only pass ``ft_feature``. For
        instance, given a node table

        >>> import pandas as pd
        >>> import deepgraph as dg
        >>> v = pd.DataFrame({'time': [-3.6,-1.1,1.4,4., 6.3],
        ...                   'x': [-3.,3.,1.,12.,7.]})
        >>> g = dg.DeepGraph(v)

        >>> g.v
           time   x
        0  -3.6  -3
        1  -1.1   3
        2   1.4   1
        3   4.0  12
        4   6.3   7

        one may create and select edges by

        >>> g.create_edges_ft(ft_feature=('time', 5))

        >>> g.e
             ft_r
        s t
        0 1   2.5
          2   5.0
        1 2   2.5
        2 3   2.6
          4   4.9
        3 4   2.3

        leaving only edges with a time difference smaller than (or equal to)
        ``ftt`` = 5. Note that the node table always has to be sorted by the
        fast-track feature. This is due to the fact that the algorithm only
        processes pairs of nodes whose fast-track relation is smaller than (or
        equal to) the fast-track threshold, and the (pre)determination of these
        pairs relies on a sorted DataFrame.

        2. Hierarchical Selection

        Additionally, one may define ``connectors`` and ``selectors`` as
        described in ``create_edges`` (see 1.-3.). Per default, the (internal)
        fast-track selector is applied first. It's order of application,
        however, may be determined by inserting the string 'ft_selector' in the
        desired position of the list of ``selectors``.

        The remaining arguments are as described in ``create_edges``, apart
        from ``min_chunk_size``, ``max_pairs``, ``from_pos`` and ``to_pos``. If
        computation time and/or memory consumption are a concern, one may
        therefore read the remaining paragraph.

        3. Parallelization and Memory Control on a FastTrack

        At each iteration step, the algorithm takes a number of nodes (n =
        ``min_chunk_size``, per default n=1000) and computes the fast track
        relation (distance) between the last node and the first node, d_ftf =
        ftf_last - ftf_first. In case d_ftf > ``ftt``, all nodes with a fast-
        track feature < ftf_last - ``ftt`` are considered source nodes, and
        their relations with all n nodes are computed (hierarchical selection).
        In case d_ftf <= ``ftt``, n is increased, s.t. d_ftf > ``ftt``. This
        might lead to a large number of pairs of nodes to process at a given
        iteration step. In order to control memory consumption, one might
        therefore set ``max_pairs`` to a suitable value, triggering a
        subiteration if this value is exceeded.

        In order to parallelize the iterative computation, one may pass the
        arguments ``from_pos`` and ``to_pos``. They determine the range of
        **source nodes** to process (endpoint excluded). Hence, ``from_pos``
        has to be in [0, g.n[, and ``to_pos`` in [1,g.n]. For instance, given
        the node table above

        >>> g.v
           time   x
        0  -3.6  -3
        1  -1.1   3
        2   1.4   1
        3   4.0  12
        4   6.3   7

        we can compute all relations of the source nodes in [1,3[ by

        >>> g.create_edges_ft(ft_feature=('time', 5), from_pos=1, to_pos=3)

        >>> g.e
             ft_r
        s t
        1 2   2.5
        2 3   2.6
          4   4.9

        Like ``create_edges``, this method also works with a ``pd.HDFStore``
        containing the DataFrame representing the node table. Only the data
        requested by ``ft_feature``, ``transfer_features`` and the user-defined
        ``connectors`` and ``selectors`` at each iteration step is then pulled
        from the store. The node table in the store has to be in table(t)
        format, and additionally, the fast_track feature has to be a data
        column. For instance, storing the above node table

        >>> vstore = pd.HDFStore('vstore.h5')
        >>> vstore.put('node_table', v, format='t', data_columns=True,
        ...            index=False)

        one may initiate a DeepGraph instance with the store

        >>> g = dg.DeepGraph(vstore)

        >>> g.v
        <class 'pandas.io.pytables.HDFStore'>
        File path: vstore.h5
        /node_table            frame_table  (typ->appendable,nrows->5,ncols->2,
        indexers->[index],dc->[time,x])

        and then create edges the same way as if ``g.v`` were a DataFrame

        >>> g.create_edges_ft(ft_feature=('time', 5), from_pos=1, to_pos=3)

        >>> g.e
             ft_r
        s t
        1 2   2.5
        2 3   2.6
          4   4.9

        .. warning:: There is no assertion whether the node table in a store is
                     sorted by the fast-track feature! The result of an
                     unsorted table is unpredictable, and generally not
                     correct.

        Parameters
        ----------
        ft_feature : tuple
            A tuple (ftf, ftt), where ftf is a column name of ``v`` (the fast-
            track feature) and ftt a positive number (the fast-track
            threshold). The fast-track feature may contain integers or floats,
            but datetime-like values are also accepted. In that case,
            ``ft_feature`` has to be a tuple of length 3, (ftf, ftt, dt_unit),
            where dt_unit is on of {'D','h','m','s','ms','us','ns'}:

             - `D`: days
             - `h`: hours
             - `m`: minutes
             - `s`: seconds
             - `ms`: milliseconds
             - `us`: microseconds
             - `ns`: nanoseconds

            determining the unit in which the temporal distance is measured.
            The variable name of the fast-track relation transferred to ``e``
            is ``ft_r``.

        connectors : function or array_like, optional (default=None)
            User defined connector function(s) that compute pairwise relations
            between the nodes in ``v``. A connector accepts multiple column
            names of ``v`` (with '_s' and/or '_t' appended, indicating source
            node values and target node values, respectively) as input, as well
            as already computed relations of former connectors. A connector
            function may have multiple output variables. Every output variable
            has to be a 1-dimensional ``np.ndarray`` (with arbitrary dtype,
            including ``object``). A connector may also depend on the fast-
            track relations ('ft_r'). See ``dg.functions`` for examplary
            connector functions.

        selectors : function or array_like, optional (default=None)
            User defined selector function(s) that select edges during the
            iteration process, based on some conditions on the node's features
            and their computed relations. Every selector function must have
            ``sources`` and ``targets`` as input arguments as well as in the
            return statement. A selector may depend on column names of ``v``
            (with '_s' and/or '_t' appended) and/or computed relations of
            connector functions, and/or computed relations of former selector
            functions. Apart from ``sources`` and ``targets``, they may also
            return computed relations (see connectors). A selector may also
            depend on the fast-track relations ('ft_r'). See ``dg.functions``
            for exemplary selector functions.

            Note: To specify the hierarchical order of the selection by the
            fast-track selector, insert the string 'ft_selector' in the
            corresponding position of the ``selectors`` list. Otherwise,
            computation of ft_r and selection by the fast-track selector is
            carried out first.

        transfer_features : str, int or array_like, optional (default=None)
            A (list of) column name(s) of ``v``, indicating which features of
            ``v`` to transfer to ``e`` (appending '_s' and '_t' to the column
            names of ``e``, indicating source and target node features,
            respectively).

        r_dtype_dic : dict, optional (default=None)
            A dictionary with names of computed relations of connectors and/or
            selectors as keys and dtypes as values. Forces the data types of
            the computed relations in ``e`` during the iteration (but **after**
            all selectors and connectors were processed), otherwise infers
            them.

        no_transfer_rs : str or array_like, optional (default=None)
            Name(s) of computed relations that are not to be transferred to the
            created edge table ``e``. Can be used to save memory, e.g., if a
            selector depends on computed relations that are of no interest
            otherwise.

        min_chunk_size : int, optional (default=1000)
            The minimum number of nodes to form pairs of at each iteration
            step. See above for details.

        max_pairs : positive integer, optional (default=1e6)
            The maximum number of pairs of nodes to process at any given
            iteration step. If the number is exceeded, a memory saving
            subiteration is applied.

        from_pos : int, optional (default=0)
            The locational index (.iloc) of ``v`` to start the iteration.
            Determines the range of **source nodes** to process, in conjuction
            with ``to_pos``. Has to be in [0, g.n[, and smaller than
            ``to_pos``. See above for details and an example.

        to_pos : int, optional (default=None)
            The locational index (.iloc) of ``v`` to end the iteration
            (excluded). Determines the range of **source nodes** to process, in
            conjuction with ``from_pos``. Has to be in [1, g.n], and larger
            than ``from_pos``. Defaults to None, which translates to the last
            node of ``v``, to_pos=g.n. See above for details and an example.

        hdf_key : str, optional (default=None)
            If you initialized ``dg.DeepGraph`` with a ``pandas.HDFStore`` and
            the store has multiple nodes, you must pass the key to the node in
            the store that corresponds to the node table.

        verbose : bool, optional (default=False)
            Whether to print information at each step of the iteration process.

        logfile : str, optional (default=None)
            Create a log-file named by ``logfile``. Contains the time and date
            of the method's call, the input arguments and time mesaurements for
            each iteration step. A plot of ``logfile`` can be created by
            ``dg.DeepGraph.plot_logfile``.

        Returns
        -------
        e : pd.DataFrame
            Set the created edge table ``e`` as attribute of ``dg.DeepGraph``.

        See also
        --------
        create_edges

        Notes
        -----
        The parameter ``min_chunk_size`` enforces a vectorized iteration and
        changing its value can both accelerate or slow down computation time.
        This depends mostly on the distribution of values of the fast track
        feature, and the complexity of the given ``connectors`` and
        ``selectors``. Use the logging capabilites to determine a good value.

        When using a ``pd.HDFStore`` for the computation, the following advice
        might be considered. Recall that the only requirements on the node in
        the store are: the format is table(t), not fixed(t); the node is sorted
        by the fast-track feature; and the fast-track feature is a data column.

        The recommended procedure of storing a given node table ``v`` in a
        store is the following (using the above node table):

        >>> vstore = pd.HDFStore('vstore.h5')
        >>> vstore.put('node_table', v, format='t', data_columns=True,
        ...            index=False)

        Setting index=False significantly decreases the time to construct the
        node in the store, and also reduces the resulting file size. It has no
        impact, however, on the capability of querying the store (with the
        pd.HDFStore.select* methods).

        However, there are two reasons one might want to create a pytables
        index of the fast-track feature:

        1. The node table might be too large to be sorted in memory. To sort it
        on disc, one may proceed as follows. Assuming an unsorted (large) node
        table

        >>> v = pd.DataFrame({'time': [6.3,-3.6,4.,-1.1,1.4],
        ...                   'x': [-3.,3.,1.,12.,7.]})

        >>> v
           time   x
        0   6.3  -3
        1  -3.6   3
        2   4.0   1
        3  -1.1  12
        4   1.4   7

        one stores it as recommended

        >>> vstore = pd.HDFStore('vstore.h5')
        >>> vstore.put('node_table', v, format='t', data_columns=True,
        ...            index=False)
        >>> vstore.get_storer('node_table').group.table
        /node_table/table (Table(5,)) ''
          description := {
          "index": Int64Col(shape=(), dflt=0, pos=0),
          "time": Float64Col(shape=(), dflt=0.0, pos=1),
          "x": Float64Col(shape=(), dflt=0.0, pos=2)}
          byteorder := 'little'
          chunkshape := (2730,)

        creates a (full) pytables index of the fast-track feature

        >>> vstore.create_table_index('node_table', columns=['time'],
        ...                           kind='full')
        >>> vstore.get_storer('node_table').group.table
        /node_table/table (Table(5,)) ''
          description := {
          "index": Int64Col(shape=(), dflt=0, pos=0),
          "time": Float64Col(shape=(), dflt=0.0, pos=1),
          "x": Float64Col(shape=(), dflt=0.0, pos=2)}
          byteorder := 'little'
          chunkshape := (2730,)
          autoindex := True
          colindexes := {
            "time": Index(6, full, shuffle, zlib(1)).is_csi=True}

        and then sorts it on disc with

        >>> vstore.close()
        >>> !ptrepack --chunkshape=auto --sortby=time vstore.h5 s_vstore.h5
        >>> s_vstore = pd.HDFStore('s_vstore.h5')

        >>> s_vstore.node_table
           time   x
        1  -3.6   3
        3  -1.1  12
        4   1.4   7
        2   4.0   1
        0   6.3  -3

        2. To speed up the internal queries on the fast-track feature

        >>> s_vstore.create_table_index('node_table', columns=['time'],
        ...                             kind='full')

        See
        http://stackoverflow.com/questions/17893370/ptrepack-sortby-needs-full-index
        and
        https://gist.github.com/michaelaye/810bd0720bb1732067ff
        for details, benchmarks, and the effects of compressing the store.

        """

        # logging
        if logfile:
            _, _, _, argvalues = inspect.getargvalues(inspect.currentframe())
            with open(logfile, "w") as log:
                print("# LOG FILE", file=log)
                print("# function call on: {}".format(datetime.now()), file=log)
                print("#", file=log)
                print("# Parameters", file=log)
                print("# ----------", file=log)
                for arg, value in argvalues.items():
                    print("# ", (arg, value), end="", file=log)
                    print("", file=log)
                print("#", file=log)
                print("# Iterations", file=log)
                print("# ----------", file=log)
                print("# max_pairs exceeded(1) | nr.of pairs | nr.of edges | " "comp.time(s)\n", file=log)

        # measure performance
        start_generation = datetime.now()

        # v shortcut
        v = self.v

        # hdf key
        if isinstance(v, pd.HDFStore) and hdf_key is None:
            assert len(v.keys()) == 1, (
                "hdf store has multiple nodes, hdf_key corresponding to the " "node table has to be passed."
            )
            hdf_key = self.v.keys()[0]

        # datetime?
        if isinstance(v, pd.HDFStore):
            is_datetime = isinstance(pd.Index(v.select_column(hdf_key, ft_feature[0], stop=0)), pd.DatetimeIndex)
        else:
            is_datetime = isinstance(pd.Index(v.iloc[0:0][ft_feature[0]]), pd.DatetimeIndex)

        # for datetime fast track features, split ft_feature
        if is_datetime:
            assert len(ft_feature) == 3, "for a datetime-like fast track feature, " "the unit has to specified"
            dt_unit = ft_feature[-1]
            ft_feature = ft_feature[:2]
        else:
            dt_unit = None

        # create empty transfer features list if not given
        if transfer_features is None:
            transfer_features = []
        elif not _is_array_like(transfer_features):
            transfer_features = [transfer_features]

        # assert that v is sorted by the fast track feature
        if isinstance(v, pd.DataFrame):
            assert pd.Index(
                v[ft_feature[0]]
            ).is_monotonic_increasing, "The node table is not sorted by the fast track feature."

        # initialize
        coldtypedic, verboseprint = _initiate_create_edges(
            verbose, v, ft_feature, connectors, selectors, r_dtype_dic, transfer_features, no_transfer_rs, hdf_key
        )

        # iteratively create link data frame (fast track iterator)
        self.e = _ft_iterator(
            self,
            v,
            min_chunk_size,
            from_pos,
            to_pos,
            dt_unit,
            ft_feature,
            coldtypedic,
            transfer_features,
            max_pairs,
            verboseprint,
            logfile,
            hdf_key,
        )

        # performance
        deltat = datetime.now() - start_generation
        verboseprint("")
        verboseprint(
            "computation time of function call:",
            "\ts =",
            int(deltat.total_seconds()),
            "\tms =",
            str(deltat.microseconds / 1000.0)[:6],
            "\n",
        )

[docs]    def partition_nodes(self, features, feature_funcs=None, n_nodes=True, return_gv=False):
        """Return a supernode DataFrame ``sv``.

        This is essentially a wrapper around the pandas groupby method: ``sv``
        = ``v``.groupby(``features``).agg(``feature_funcs``). It creates a
        (intersection) partition of the nodes in ``v`` by the type(s) of
        feature(s) ``features``, resulting in a supernode DataFrame ``sv``. By
        passing a dictionary of functions on the features of ``v``,
        ``feature_funcs``, one may aggregate user-defined values of the
        partition's elements, the supernodes' features. If ``n_nodes`` is True,
        create a column with the number of each supernode's constituent nodes.
        If ``return_gv`` is True, return the created groupby object to
        facilitate additional operations, such as ``gv``.apply(func, *args,
        **kwargs).

        For details, type help(``v``.groupby), and/or inspect the available
        methods of ``gv``.

        For examples, see below. For an in-depth description and mathematical
        details of graph partitioning, see
        https://arxiv.org/pdf/1604.00971v1.pdf, in particular Sec. III A, E
        and F.

        Parameters
        ----------
        features : str, int or array_like
            Column name(s) of ``v``, indicating the type(s) of feature(s) used
            to induce a (intersection) partition. Creates a pandas groupby
            object, ``gv`` = ``v``.groupby(``features``).

        feature_funcs : dict, optional (default=None)
            Each key must be a column name of ``v``, each value either a
            function, or a list of functions, working when passed a
            ``pandas.DataFrame`` or when passed to ``pandas.DataFrame.apply``.
            See the docstring of ``gv``.agg for details: help(``gv``.agg).

        n_nodes : bool, optional (default=True)
            Whether to create a ``n_nodes`` column in ``sv``, indicating the
            number of nodes in each supernode.

        return_gv : bool, optional (default=False)
            If True, also return the ``v``.groupby(``features``) object,
            ``gv``.

        Returns
        -------
        sv : pd.DataFrame
            The aggreated DataFrame of supernodes, ``sv``.

        gv : pandas.core.groupby.DataFrameGroupBy
            The pandas groupby object, ``v``.groupby(``features``).

        See also
        --------
        partition_edges
        partition_graph

        Notes
        -----
        Currently, NA groups in GroupBy are automatically excluded (silently).
        One workaround is to use a placeholder (e.g., -1, 'none') for NA values
        before doing the groupby (calling this method). See
        http://stackoverflow.com/questions/18429491/groupby-columns-with-nan-missing-values
        and https://github.com/pydata/pandas/issues/3729.

        Examples
        --------
        First, we need a node table, in order to demonstrate its partitioning:

        >>> import pandas as pd
        >>> import deepgraph as dg
        >>> v = pd.DataFrame({'x': [-3.4,2.1,-1.1,0.9,2.3],
        ...                   'time': [0,0,2,2,9],
        ...                   'color': ['g','g','b','g','r'],
        ...                   'size': [1,3,2,3,1]})
        >>> g = dg.DeepGraph(v)
        >>> g.v
          color  size  time    x
        0     g     1     0 -3.4
        1     g     3     0  2.1
        2     b     2     2 -1.1
        3     g     3     2  0.9
        4     r     1     9  2.3

        Create a partition by the type of feature 'color':

        >>> g.partition_nodes('color')
               n_nodes
        color
        b            1
        g            3
        r            1

        Create an intersection partition by the types of features 'color' and
        'size' (which is a further refinement of the last partition):

        >>> g.partition_nodes(['color', 'size'])
                    n_nodes
        color size
        b     2           1
        g     1           1
              3           2
        r     1           1

        Partition by 'color' and collect x values:

        >>> g.partition_nodes('color', {'time': lambda x: list(x)})
               n_nodes       time
        color
        b            1        [2]
        g            3  [0, 0, 2]
        r            1        [9]

        Partition by 'color' and aggregate with different functions:

        >>> g.partition_nodes('color', {'time': [lambda x: list(x), np.max],
        ...                             'x': [np.mean, np.sum, np.std]})
               n_nodes    x_mean  x_sum     x_std time_<lambda>  time_amax
        color
        b            1 -1.100000   -1.1       NaN           [2]          2
        g            3 -0.133333   -0.4  2.891943     [0, 0, 2]          2
        r            1  2.300000    2.3       NaN           [9]          9

        """

        # groupby and aggregate
        gv = self.v.groupby(features)
        sv = _aggregate_super_table(funcs=feature_funcs, size=n_nodes, gt=gv)
        if n_nodes:
            try:
                sv.rename(columns={"size": "n_nodes"}, inplace=True)
            except TypeError:
                sv = sv.rename(columns={"size": "n_nodes"})
        if return_gv:
            return sv, gv
        else:
            return sv

[docs]    def partition_edges(
        self,
        relations=None,
        source_features=None,
        target_features=None,
        relation_funcs=None,
        n_edges=True,
        return_ge=False,
    ):
        """Return a superedge DataFrame ``se``.

        This method allows you to partition the edges in ``e`` by their types
        of relations, but also by the types of features of their incident
        source and target nodes, and any combination of the three.

        Essentially, this method is a wrapper around the pandas groupby method:
        ``se`` = ``e``.groupby(``relations`` + features_s +
        features_t).agg(``relation_funcs``), where ``relations`` are column
        names of ``e``, and in order to group ``e`` by features_s and/or
        features_t, the features of type ``source_features`` and/or
        ``target_features`` (column names of ``v``) are transferred to ``e``,
        appending '_s' and/or '_t' to the corresponding column names of ``e``
        (if they are not already present). The only requirement on the
        combination of ``relations``, ``source_features`` and
        ``target_features`` is that at least on of the lists has to be of
        length >= 1.

        By passing a dictionary of functions on the relations of ``e``,
        ``relation_funcs``, one may aggregate user-defined values of the
        partition's elements, the superedges' relations. If ``n_edges`` is
        True, create a column with the number of each superedge's constituent
        edges. If ``return_ge`` is True, return the created groupby object to
        facilitate additional operations, such as ``ge``.apply(func, *args,
        **kwargs).

        For details, type help(``g.e``.groupby), and/or inspect the available
        methods of ``ge``.

        For examples, see below. For an in-depth description and mathematical
        details of graph partitioning, see
        https://arxiv.org/pdf/1604.00971v1.pdf, in particular Sec. III B, E
        and F.

        Parameters
        ----------
        relations : str, int or array_like, optional (default=None)
            Column name(s) of ``e``, indicating the type(s) of relation(s) used
            to induce a (intersection) partition of ``e`` (in conjunction with
            ``source_features`` and ``target_features``).

        source_features : str, int or array_like, optional (default=None)
            Column name(s) of ``v``, indicating the type(s) of feature(s) of
            the edges' incident source nodes used to induce a (intersection)
            partition of ``e`` (in conjunction with ``relations`` and
            ``target_features``).

        target_features : str, int or array_like, optional (default=None)
            Column name(s) of ``v``, indicating the type(s) of feature(s) of
            the edges' incident target nodes used to induce a (intersection)
            partition of ``e`` (in conjunction with ``relations`` and
            ``source_features``).

        relation_funcs : dict, optional (default=None)
            Each key must be a column name of ``e``, each value a (list of)
            function(s), working when passed a ``pandas.DataFrame`` or when
            passed to ``pandas.DataFrame.apply``. See the docstring of
            ``ge``.agg for details: help(``ge``.agg).

        n_edges : bool, optional (default=True)
            Whether to create a ``n_edges`` column in ``se``, indicating the
            number of edges in each superedge.

        return_ge : bool, optional (default=False)
            If True, also return the pandas groupby object, ``ge``.

        Returns
        -------
        se : pd.DataFrame
            The aggreated DataFrame of superedges, ``se``.

        ge : pandas.core.groupby.DataFrameGroupBy
            The pandas groupby object, ``ge``.

        See also
        --------
        partition_nodes
        partition_graph

        Notes
        -----
        Currently, NA groups in GroupBy are automatically excluded (silently).
        One workaround is to use a placeholder (e.g., -1, 'none') for NA values
        before doing the groupby (calling this method). See
        http://stackoverflow.com/questions/18429491/groupby-columns-with-nan-missing-values
        and https://github.com/pydata/pandas/issues/3729.

        Examples
        --------
        First, we need to create a graph in order to demonstrate how to
        partition its edge set.

        Create a node table:

        >>> import pandas as pd
        >>> import deepgraph as dg
        >>> v = pd.DataFrame({'x': [-3.4,2.1,-1.1,0.9,2.3],
        ...                   'time': [0,1,2,5,9],
        ...                   'color': ['g','g','b','g','r'],
        ...                   'size': [1,3,2,3,1]})
        >>> g = dg.DeepGraph(v)

        >>> g.v
          color  size  time    x
        0     g     1     0 -3.4
        1     g     3     1  2.1
        2     b     2     2 -1.1
        3     g     3     5  0.9
        4     r     1     9  2.3

        Create an edge table:

        >>> def some_relations(ft_r, x_s,x_t,color_s,color_t,size_s,size_t):
        ...     dx = x_t - x_s
        ...     v = dx / ft_r
        ...     same_color = color_s == color_t
        ...     larger_than = size_s > size_t
        ...     return dx, v, same_color, larger_than
        >>> g.create_edges_ft(('time', 5), connectors=some_relations)
        >>> g.e.rename(columns={'ft_r': 'dt'}, inplace=True)
        >>> g.e['inds'] = g.e.index.values  # to ease the eyes

        >>> g.e
              dx  dt larger_than same_color         v    inds
        s t
        0 1  5.5   1       False       True  5.500000  (0, 1)
          2  2.3   2       False      False  1.150000  (0, 2)
          3  4.3   5       False       True  0.860000  (0, 3)
        1 2 -3.2   1        True      False -3.200000  (1, 2)
          3 -1.2   4       False       True -0.300000  (1, 3)
        2 3  2.0   3       False      False  0.666667  (2, 3)
        3 4  1.4   4        True      False  0.350000  (3, 4)

        Partitioning by the type of relation 'larger_than':

        >>> g.partition_edges(relations='larger_than',
        ...                   relation_funcs={'dx': ['mean', 'std'],
        ...                                   'same_color': 'sum'})
                     n_edges  same_color_sum  dx_mean    dx_std
        larger_than
        False              5               3     2.58  2.558711
        True               2               0    -0.90  3.252691

        A refinement of the last partition by the type of relation
        'same_color':

        >>> g.partition_edges(relations=['larger_than', 'same_color'],
        ...                   relation_funcs={'dx': ['mean', 'std'],
        ...                                   'dt': lambda x: tuple(x)})
                                n_edges dt_<lambda>   dx_mean    dx_std
        larger_than same_color
        False       False             2      (2, 3)  2.150000  0.212132
                    True              3   (1, 5, 4)  2.866667  3.572581
        True        False             2      (1, 4) -0.900000  3.252691

        Partitioning by the type of source feature 'color':

        >>> g.partition_edges(source_features='color',
        ...                   relation_funcs={'same_color': 'sum'})
                 n_edges  same_color
        color_s
        b              1           0
        g              6           3

        As one can see, the type of feature 'color' of the source nodes has
        been transferred to ``e``:

        >>> g.e
              dx  dt larger_than same_color         v    inds color_s
        s t
        0 1  5.5   1       False       True  5.500000  (0, 1)       g
          2  2.3   2       False      False  1.150000  (0, 2)       g
          3  4.3   5       False       True  0.860000  (0, 3)       g
        1 2 -3.2   1        True      False -3.200000  (1, 2)       g
          3 -1.2   4       False       True -0.300000  (1, 3)       g
        2 3  2.0   3       False      False  0.666667  (2, 3)       b
        3 4  1.4   4        True      False  0.350000  (3, 4)       g

        A further refinement of the last partition by the type of source
        feature 'size':

        >>> g.partition_edges(source_features=['color', 'size'],
        ...                   relation_funcs={'same_color': 'sum',
        ...                                   'inds': lambda x: tuple(x)})
                        n_edges  same_color                      inds
        color_s size_s
        b       2             1           0                 ((2, 3),)
        g       1             3           2  ((0, 1), (0, 2), (0, 3))
                3             3           1  ((1, 2), (1, 3), (3, 4))

        Partitioning by the types of target features ('color', 'size'):

        >>> g.partition_edges(target_features=['color', 'size'],
        ...                   relation_funcs={'same_color': 'sum',
        ...                                   'inds': lambda x: tuple(x)})
                        n_edges  same_color                              inds
        color_t size_t
        b       2             2           0                  ((0, 2), (1, 2))
        g       3             4           3  ((0, 1), (0, 3), (1, 3), (2, 3))
        r       1             1           0                         ((3, 4),)

        Partitioning by the type of source feature 'color' and the type of
        target feature 'size':

        >>> g.partition_edges(source_features='color', target_features='size',
        ...                   relation_funcs={'same_color': 'sum',
        ...                                   'inds': lambda x: tuple(x)})
                        n_edges  same_color                      inds
        color_s size_t
        b       3             1           0                 ((2, 3),)
        g       1             1           0                 ((3, 4),)
                2             2           0          ((0, 2), (1, 2))
                3             3           3  ((0, 1), (0, 3), (1, 3))

        A further refinement of the last partition by the type of relation
        'larger_than':

        >>> g.partition_edges(relations='larger_than',
        ...                   source_features='color', target_features='size',
        ...                   relation_funcs={'inds': lambda x: tuple(x)})
                                    n_edges                      inds
        larger_than color_s size_t
        False       b       3             1                 ((2, 3),)
                    g       2             1                 ((0, 2),)
                            3             3  ((0, 1), (0, 3), (1, 3))
        True        g       1             1                 ((3, 4),)
                            2             1                 ((1, 2),)

        """

        if not relations:
            relations = []

        if not _is_array_like(relations):
            relations = [relations]

        # transfer feature columns to g.e, for fast groupby
        if source_features:
            if not _is_array_like(source_features):
                source_features = [source_features]
            cols_s = []
            for sf_col in source_features:
                cols_s.append(sf_col + "_s")
                if sf_col + "_s" not in self.e.columns:
                    s = self.e.index.get_level_values(0)
                    self.e.loc[:, sf_col + "_s"] = self.v.loc[s, sf_col].values
        else:
            cols_s = []

        if target_features:
            if not _is_array_like(target_features):
                target_features = [target_features]
            cols_t = []
            for tf_col in target_features:
                cols_t.append(tf_col + "_t")
                if tf_col + "_t" not in self.e.columns:
                    s = self.e.index.get_level_values(1)
                    self.e.loc[:, tf_col + "_t"] = self.v.loc[s, tf_col].values
        else:
            cols_t = []

        cols = relations + cols_s + cols_t

        ge = self.e.groupby(cols)
        se = _aggregate_super_table(funcs=relation_funcs, size=n_edges, gt=ge)

        if n_edges:
            se = se.rename(columns={"size": "n_edges"})

        if return_ge:
            return se, ge
        else:
            return se

[docs]    def partition_graph(
        self, features, feature_funcs=None, relation_funcs=None, n_nodes=True, n_edges=True, return_gve=False
    ):
        """Return supergraph DataFrames ``sv`` and ``se``.

        This method allows partitioning of the  graph represented by ``v`` and
        ``e`` into a supergraph, ``sv`` and ``se``. It creates a (intersection)
        partition of the nodes in ``v`` by the type(s) of feature(s)
        ``features``, together with the (intersection) partition's
        **corresponding** partition of the edges in ``e``.

        Essentially, this method is a wrapper around pandas groupby methods:
        ``sv`` = ``v``.groupby(``features``).agg(``feature_funcs``) and
        ``se`` = ``e``.groupby(features_s+features_t).agg(``relation_funcs``).
        In order to group ``e`` by features_s and features_t, the features of
        type ``features`` are transferred to ``e``, appending '_s' and '_t' to
        the corresponding column names of ``e``, indicating source and target
        features, respectively (if they are not already present).

        By passing a dictionary of functions on the features (relations) of
        ``v`` (``e``), ``feature_funcs`` (``relation_funcs``), one may
        aggregate user-defined values of the partition's elements, the
        supernodes' (superedges') features (relations). If ``n_nodes``
        (``n_edges``) is True, create a column with the number of each
        supernode's (superedge's) constituent nodes (edges).

        If ``return_gve`` is True, return the created groupby objects to
        facilitate additional operations, such as ``gv``.apply(func, *args,
        **kwargs) or ``ge``.apply(func, *args, **kwargs).

        For details, type help(``g.v``.groupby), and/or inspect the available
        methods of ``gv``.

        For examples, see below. For an in-depth description and mathematical
        details of graph partitioning, see
        https://arxiv.org/pdf/1604.00971v1.pdf, in particular Sec. III C, E
        and F.

        Parameters
        ----------
        features : str, int or array_like
            Column name(s) of ``v``, indicating the type(s) of feature(s) used
            to induce a (intersection) partition of ``v``, and its
            **corresponding** partition of the edges in ``e``. Creates pandas
            groupby objects, ``gv`` and ``ge``.

        feature_funcs : dict, optional (default=None)
            Each key must be a column name of ``v``, each value either a
            function, or a list of functions, working when passed a
            ``pandas.DataFrame`` or when passed to ``pandas.DataFrame.apply``.
            See the docstring of ``gv``.agg for details: help(``gv``.agg).

        relation_funcs : dict, optional (default=None)
            Each key must be a column name of ``e``, each value either a
            function, or a list of functions, working when passed a
            ``pandas.DataFrame`` or when passed to ``pandas.DataFrame.apply``.
            See the docstring of ``ge``.agg for details: help(``ge``.agg).

        n_nodes : bool, optional (default=True)
            Whether to create a ``n_nodes`` column in ``sv``, indicating the
            number of nodes in each supernode.

        n_edges : bool, optional (default=True)
            Whether to create a ``n_edges`` column in ``se``, indicating the
            number of edges in each superedge.

        return_gve : bool, optional (default=False)
            If True, also return the pandas groupby objects, ``gv`` and ``ge``.

        Returns
        -------
        sv : pd.DataFrame
            The aggreated DataFrame of supernodes, ``sv``.

        se : pd.DataFrame
            The aggregated DataFrame of superedges, ``se``.

        gv : pandas.core.groupby.DataFrameGroupBy
            The pandas groupby object, ``v``.groupby(``features``).

        ge : pandas.core.groupby.DataFrameGroupBy
            The pandas groupby object, ``e``.groupby(features_i+feaures_j).

        See also
        --------
        partition_nodes
        partition_edges

        Notes
        -----
        Currently, NA groups in GroupBy are automatically excluded (silently).
        One workaround is to use a placeholder (e.g., -1, 'none') for NA values
        before doing the groupby (calling this method). See
        http://stackoverflow.com/questions/18429491/groupby-columns-with-nan-missing-values
        and https://github.com/pydata/pandas/issues/3729.

        Examples
        --------
        First, we need to create a graph in order to demonstrate its
        partitioning into a supergraph.

        Create a node table:

        >>> import pandas as pd
        >>> import deepgraph as dg
        >>> v = pd.DataFrame({'x': [-3.4,2.1,-1.1,0.9,2.3],
        ...                   'time': [0,1,2,5,9],
        ...                   'color': ['g','g','b','g','r'],
        ...                   'size': [1,3,2,3,1]})
        >>> g = dg.DeepGraph(v)

        >>> g.v
          color  size  time    x
        0     g     1     0 -3.4
        1     g     3     1  2.1
        2     b     2     2 -1.1
        3     g     3     5  0.9
        4     r     1     9  2.3

        Create an edge table:

        >>> def some_relations(ft_r, x_s,x_t,color_s,color_t,size_s,size_t):
        ...     dx = x_t - x_s
        ...     v = dx / ft_r
        ...     same_color = color_s == color_t
        ...     larger_than = size_s > size_t
        ...     return dx, v, same_color, larger_than
        >>> g.create_edges_ft(('time', 5), connectors=some_relations)
        >>> g.e.rename(columns={'ft_r': 'dt'}, inplace=True)
        >>> g.e['inds'] = g.e.index.values  # to ease the eyes

        >>> g.e
              dx  dt larger_than same_color         v    inds
        s t
        0 1  5.5   1       False       True  5.500000  (0, 1)
          2  2.3   2       False      False  1.150000  (0, 2)
          3  4.3   5       False       True  0.860000  (0, 3)
        1 2 -3.2   1        True      False -3.200000  (1, 2)
          3 -1.2   4       False       True -0.300000  (1, 3)
        2 3  2.0   3       False      False  0.666667  (2, 3)
        3 4  1.4   4        True      False  0.350000  (3, 4)

        Create a supergraph by partitioning by the type of feature 'color':

        >>> sv, se = g.partition_graph('color')

        >>> sv
               n_nodes
        color
        b            1
        g            3
        r            1

        >>> se
                         n_edges
        color_s color_t
        b       g              1
        g       b              2
                g              3
                r              1

        Create intersection partitions by the types of features 'color' and
        'size' (which are further refinements of the last partitions):

        >>> sv, se = g.partition_graph(
        ...     ['color', 'size'],
        ...     relation_funcs={'inds': lambda x: tuple(x)})

        >>> sv
                    n_nodes
        color size
        b     2           1
        g     1           1
              3           2
        r     1           1

        >>> se
                                       n_edges              inds
        color_s size_s color_t size_t
        b       2      g       3             1         ((2, 3),)
        g       1      b       2             1         ((0, 2),)
                       g       3             2  ((0, 1), (0, 3))
                3      b       2             1         ((1, 2),)
                       g       3             1         ((1, 3),)
                       r       1             1         ((3, 4),)

        Partition by 'color' and aggregate some properties:

        >>> sv, se = g.partition_graph('color',
        ...     feature_funcs={'time': lambda x: list(x)},
        ...     relation_funcs={'larger_than': 'sum', 'same_color': 'sum'})

        >>> sv
               n_nodes       time
        color
        b            1        [2]
        g            3  [0, 1, 5]
        r            1        [9]

        >>> se
                         n_edges larger_than  same_color
        color_s color_t
        b       g              1       False           0
        g       b              2        True           0
                g              3       False           3
                r              1        True           0

        """

        gv = self.v.groupby(features)
        sv = _aggregate_super_table(funcs=feature_funcs, size=n_nodes, gt=gv)
        if n_nodes:
            sv.rename(columns={"size": "n_nodes"}, inplace=True)

        # transfer feature columns to g.e, for fast groupby
        cols_s = []
        cols_t = []
        if not _is_array_like(features):
            features = [features]
        for col in features:
            cols_s.append(col + "_s")
            cols_t.append(col + "_t")
            if col + "_s" not in self.e.columns:
                s = self.e.index.get_level_values(0)
                self.e.loc[:, col + "_s"] = self.v.loc[s, col].values
            if col + "_t" not in self.e.columns:
                t = self.e.index.get_level_values(1)
                self.e.loc[:, col + "_t"] = self.v.loc[t, col].values

        ge = self.e.groupby(cols_s + cols_t)
        se = _aggregate_super_table(funcs=relation_funcs, size=n_edges, gt=ge)
        if n_edges:
            se = se.rename(columns={"size": "n_edges"})

        if return_gve:
            return sv, se, gv, ge
        else:
            return sv, se

[docs]    def return_cs_graph(self, relations=False, dropna=True):
        """Return ``scipy.sparse.coo_matrix`` representation(s).

        Create a compressed sparse graph representation for each type of
        relation given by ``relations``. ``relations`` can either be False,
        True, or a (list of) column name(s) of ``e``. If ``relations`` is False
        (default), return a single csgraph entailing all edges in ``e.index``,
        each with a weight of 1 (in that case, ``dropna`` is discarded). If
        ``relations`` is True, create one csgraph for each column of ``e``,
        where the weights are given by the columns' values. If only a subset of
        columns is to be mapped to csgraphs, ``relations`` has to be a (list
        of) column name(s) of ``e``.

        The argument ``dropna`` indicates whether to discard edges with NA
        values or not. If ``dropna`` is True or False, it applies to all types
        of relations given by ``relations``. However, ``dropna`` can also be
        array_like with the same shape as ``relations`` (or with the same shape
        as ``e.columns``, if ``relations`` is True).

        Parameters
        ----------
        relations : bool, str or array_like, optional (default=False)
            The types of relations to be mapped to scipy csgraphs. Can be
            False, True, or a (list of) column name(s) of ``e``.

        dropna : bool or array_like, optional (default=True)
            Whether to drop edges with NA values. If True or False, applies to
            all relations given by ``relations``. Otherwise, must be the same
            shape as ``relations``. If ``relations`` is False, ``dropna`` is
            discarded.

        Returns
        -------
        csgraph : scipy.sparse.coo_matrix or dict
            A dictionary, where keys are column names of ``e``, and values are
            the corresponding ``scipy.sparse.coo_matrix`` instance(s). If only
            one csgraph is created, return it directly.

        See also
        --------
        return_nx_graph
        return_nx_multigraph
        return_gt_graph

        """

        from scipy.sparse import coo_matrix

        # get indices
        index = self.v.index
        indices = index.values
        n = len(indices)

        # enumerate indices if necessary
        if type(index) is pd.RangeIndex:
            if index.start == 0 and index.stop == n:
                inddic = None
            else:
                inddic = {j: i for i, j in enumerate(indices)}
        else:
            inddic = {j: i for i, j in enumerate(indices)}

        # for default arguments
        if relations is False:
            s = self.e.index.get_level_values(0).values
            t = self.e.index.get_level_values(1).values
            if inddic:
                s = _dic_translator(s, inddic)
                t = _dic_translator(t, inddic)
            else:
                pass

            # create cs graph
            cs_g = coo_matrix((np.ones(len(s), dtype=bool), (s, t)), shape=(n, n), dtype=bool)

        else:
            if relations is True:
                relations = self.e.columns.values

            # check that relations and dropna have the same shape
            if _is_array_like(relations) and _is_array_like(dropna):
                assert len(relations) == len(dropna), "dropna and relations have different shapes!"

            if not _is_array_like(relations):
                relations = [relations]
            if not _is_array_like(dropna):
                dropna = [dropna] * len(relations)

            # create coo_matrices
            cs_g = {}
            for r, drop in zip(relations, dropna):
                if drop:
                    data = self.e[r].dropna()
                else:
                    data = self.e[r]
                s = data.index.get_level_values(0).values
                t = data.index.get_level_values(1).values
                if inddic:
                    s = _dic_translator(s, inddic)
                    t = _dic_translator(t, inddic)
                else:
                    pass

                # create cs graph
                cs_g[r] = coo_matrix((data.values, (s, t)), shape=(n, n), dtype=data.dtype)

            # if there is only one csgraph
            if len(cs_g) == 1:
                cs_g = cs_g[r]

        return cs_g

[docs]    def return_nx_graph(self, features=False, relations=False, dropna="none"):
        """Return a ``networkx.DiGraph`` representation.

        Create a ``networkx.DiGraph`` representation of the graph given by
        ``v`` and ``e``. Node and edge properties to transfer can be indicated
        by the ``features`` and ``relations`` input arguments. Whether to drop
        edges with NA values in the subset of types of relations given by
        ``relations`` can be controlled by ``dropna``.

        Needs pandas >= 0.17.0.

        Parameters
        ----------
        features : bool, str, or array_like, optional (default=False)
            Indicates which types of features to transfer as node attributes.
            Can be column name(s) of ``v``, False or True. If False, create no
            node attributes. If True, create node attributes for every column
            in ``v``. If str or array_like, must be column name(s) of ``v``
            indicating which types of features to transfer.

        relations : bool, str, or array_like, optional (default=False)
            Indicates which types of relations to transfer as edge attributes.
            Can be column name(s) of ``e``, False or True. If False, create no
            edge attributes (all edges in ``e.index`` are transferred,
            regardless of ``dropna``). If True, create edge attributes for
            every column in ``e`` (all edges in ``e.index`` are transferred,
            regardless of ``dropna``). If str or array_like, must be column
            name(s) of ``e`` indicating which types of relations to transfer
            (which edges are transferred can be controlled by ``dropna``).

        dropna : str, optional (default='none')
            One of {'none','any','all'}. If 'none', all edges in ``e.index``
            are transferred. If 'any', drop all edges (rows) in
            ``e[relations]`` where any NA values are present. If 'all', drop
            all edges (rows) in ``e[relations]`` where all values are NA. Only
            has an effect if ``relations`` is str or array_like.

        Returns
        -------
        nx_g : networkx.DiGraph

        See also
        --------
        return_nx_multigraph
        return_cs_graph
        return_gt_graph

        """

        import networkx as nx

        # create empty DiGraph
        nx_g = nx.DiGraph()

        # select features
        if features is False:
            vt = pd.DataFrame(index=self.v.index)
        elif features is True:
            vt = self.v
        elif _is_array_like(features):
            vt = self.v[features]
        else:
            vt = self.v[features].to_frame()

        # create nx compatible tuple, (index, weight_dict)
        vt = vt.to_dict("index")
        vt = ((key, value) for key, value in vt.items())

        # add nodes
        nx_g.add_nodes_from(vt)

        # select relations
        if hasattr(self, "e"):
            if relations is False:
                et = pd.DataFrame(index=self.e.index)

            elif relations is True:
                et = self.e

            elif _is_array_like(relations):
                if dropna != "none":
                    et = self.e[relations].dropna(how=dropna)
                else:
                    et = self.e[relations]

            else:
                if dropna != "none":
                    et = self.e[relations].to_frame().dropna(how=dropna)
                else:
                    et = self.e[relations].to_frame()

            # create nx compatible tuple, (index, index, weight_dict)
            et = et.to_dict("index")
            et = ((key[0], key[1], value) for key, value in et.items())

            # add edges
            nx_g.add_edges_from(et)

        return nx_g

[docs]    def return_nx_multigraph(self, features=False, relations=False, dropna=True):
        """Return a ``networkx.MultiDiGraph`` representation.

        Create a ``networkx.MultiDiGraph`` representation of the graph given by
        ``v`` and ``e``. As opposed to ``return_nx_graph``, where every row of
        ``e`` is treated as one edge, this method treats every cell of
        ``e`` as one edge. The input argument ``features`` indicates which node
        properties to transfer. ``relations`` indicates which edges to
        transfer. Whether to drop edges with NA values can be controlled by
        ``dropna``.

        Needs pandas >= 0.17.0.

        Parameters
        ----------
        features : bool, str, or array_like, optional (default=False)
            Indicates which types of features to transfer as node attributes.
            Can be column name(s) of ``v``, False or True. If False, create no
            node attributes. If True, create node attributes for every column
            in ``v``. If str or array_like, must be column name(s) of ``v``
            indicating which types of features to transfer.

        relations : bool, str, or array_like, optional (default=False)
            Indicates which cells of ``e`` to transfer as edges. Can be False,
            True, or a (list of) column name(s) of ``e``. If False (default),
            all cells of ``e`` are translated to edges, but their values are
            not transferred as edge attributes. If True, all cells of ``e`` are
            translated, and their values are transferred as edge attributes. If
            str or array_like, must be column name(s) of ``e``, restricting the
            translation of cells to edges to ``e[relations]`` (values are
            transferred as edge attributes).

        dropna : bool, optional (default=True)
            Whether to drop edges with NA values. Cells in ``e`` with NA values
            are not translated to edges.

        Returns
        -------
        nx_g : networkx.MultiDiGraph

        See also
        --------
        return_nx_graph
        return_cs_graph
        return_gt_graph

        """

        import networkx as nx

        # create empty MultiDiGraph
        nx_g = nx.MultiDiGraph()

        # select features
        if features is False:
            vt = pd.DataFrame(index=self.v.index)
        elif features is True:
            vt = self.v
        elif _is_array_like(features):
            vt = self.v[features]
        else:
            vt = self.v[features].to_frame()

        # create nx compatible tuple, (index, weight_dict)
        vt = vt.to_dict("index")
        vt = ((key, value) for key, value in vt.items())

        # add nodes
        nx_g.add_nodes_from(vt)

        # select relations
        if hasattr(self, "e"):
            if relations is False:
                if dropna:
                    et = self.e.count(axis=1).to_dict()
                    et = ((key,) * value for key, value in et.items())
                    et = chain(*et)
                else:
                    ncol = len(self.e.columns)
                    et = self.e.index
                    et = ((key,) * ncol for key in et)
                    et = chain(*et)

            elif relations is True:
                et = _iter_edges(self.e, dropna)

            elif _is_array_like(relations):
                et = self.e[relations]
                et = _iter_edges(et, dropna)

            else:
                et = self.e[relations].to_frame()
                et = _iter_edges(et, dropna)

            # add edges
            nx_g.add_edges_from(et)

        return nx_g

[docs]    def return_gt_graph(self, features=False, relations=False, dropna="none", node_indices=False, edge_indices=False):
        """Return a ``graph_tool.Graph`` representation.

        Create a ``graph_tool.Graph`` (directed) representation of the graph
        given by ``v`` and ``e``. Node and edge properties to transfer can be
        indicated by the ``features`` and ``relations`` input arguments.
        Whether to drop edges with NA values in the subset of types of
        relations given by ``relations`` can be controlled by ``dropna``. If
        the nodes in ``v`` are not indexed by consecutive integers starting
        from 0, one may internalize the original node and edge indices as
        propertymaps by setting ``node_indices`` and/or ``edge_indices`` to
        True.

        Parameters
        ----------
        features : bool, str, or array_like, optional (default=False)
            Indicates which types of features to internalize as
            ``graph_tool.PropertyMap``. Can be column name(s) of ``v``, False
            or True. If False, create no propertymaps. If True, create
            propertymaps for every column in ``v``. If str or array_like, must
            be column name(s) of ``v`` indicating which types of features to
            internalize.

        relations : bool, str, or array_like, optional (default=False)
            Indicates which types of relations to internalize as
            ``graph_tool.PropertyMap``. Can be column name(s) of ``e``, False
            or True. If False, create no propertymaps (all edges in ``e.index``
            are transferred, regardless of ``dropna``). If True, create
            propertymaps for every column in ``e`` (all edges in ``e.index``
            are transferred, regardless of ``dropna``). If str or array_like,
            must be column name(s) of ``e`` indicating which types of relations
            to internalize (which edges are transferred can be controlled by
            ``dropna``).

        dropna : str, optional (default='none')
            One of {'none','any','all'}. If 'none', all edges in ``e.index``
            are transferred. If 'any', drop all edges (rows) in
            ``e[relations]`` where any NA values are present. If 'all', drop
            all edges (rows) in ``e[relations]`` where all values are NA. Only
            has an effect if ``relations`` is str or array_like.

        node_indices : bool, optional (default=False)
            If True, internalize a vertex propertymap ``i`` with the original
            node indices.

        edge_indices : bool, optional (default=False)
            If True, internalize edge propertymaps ``s`` and ``t`` with the
            original source and target node indices of the edges, respectively.

        Returns
        -------
        gt_g : graph_tool.Graph

        See also
        --------
        return_cs_graph
        return_nx_graph
        return_nx_multigraph

        Notes
        -----
        If the index of ``v`` is not pd.RangeIndex(start=0,stop=len(``v``),
        step=1), the indices will be enumerated, which is expensive for large
        graphs.

        """

        import graph_tool as gt

        # propertymap dtypes
        dtdic = {
            "bool": "bool",
            # int16_t: 'short',
            "uint8": "int16_t",
            "int8": "int16_t",
            "int16": "int16_t",
            # int32_t: 'int',
            "uint16": "int32_t",
            "int32": "int32_t",
            # int64_t: 'long',
            "uint32": "int64_t",
            "int64": "int64_t",
            "uint64": "int64_t",
            # double: 'float',
            "float16": "double",
            "float32": "double",
            "float64": "double",
            "float128": "double",
        }

        # get indices
        index = self.v.index
        indices = index.values
        n = len(indices)

        # enumerate indices if necessary
        if type(index) is pd.RangeIndex:
            if index.start == 0 and index.stop == n:
                inddic = None
            else:
                inddic = {j: i for i, j in enumerate(indices)}
        else:
            inddic = {j: i for i, j in enumerate(indices)}

        # create empty Graph
        gt_g = gt.Graph(directed=True)

        # select features
        if features is False:
            vt = pd.DataFrame(index=index)
        elif features is True:
            vt = self.v
        elif _is_array_like(features):
            vt = self.v[features]
        else:
            vt = self.v[features].to_frame()

        # add nodes
        gt_g.add_vertex(n)

        # add vertex propertymaps
        if node_indices:
            try:
                pm = gt_g.new_vertex_property(dtdic[str(index.dtype)], indices)
            except KeyError:
                pm = gt_g.new_vertex_property("object", indices)
            # internalize
            gt_g.vertex_properties["i"] = pm

        for col in vt.columns:
            try:
                pm = gt_g.new_vertex_property(dtdic[str(vt[col].dtype)], vt[col].values)
            except KeyError:
                pm = gt_g.new_vertex_property("object", vt[col].values)
            # internalize
            gt_g.vertex_properties[str(col)] = pm

        # select relations
        if hasattr(self, "e"):
            if relations is False:
                et = pd.DataFrame(index=self.e.index)
            elif relations is True:
                et = self.e
            elif _is_array_like(relations):
                if dropna != "none":
                    et = self.e[relations].dropna(how=dropna)
                else:
                    et = self.e[relations]
            else:
                if dropna != "none":
                    et = self.e[relations].to_frame().dropna(how=dropna)
                else:
                    et = self.e[relations].to_frame()

            # add edges
            s = et.index.get_level_values(level=0).values
            t = et.index.get_level_values(level=1).values
            if inddic:
                ns = _dic_translator(s, inddic).astype(int)
                nt = _dic_translator(t, inddic).astype(int)
                gt_g.add_edge_list(np.column_stack((ns, nt)))
                del ns, nt
            else:
                gt_g.add_edge_list(np.column_stack((s, t)))

            # add edge propertymaps
            if edge_indices:
                try:
                    s = gt_g.new_edge_property(dtdic[str(s.dtype)], s)
                    t = gt_g.new_edge_property(dtdic[str(t.dtype)], t)
                except KeyError:
                    s = gt_g.new_edge_property("object", s)
                    t = gt_g.new_edge_property("object", t)
                # internalize
                gt_g.edge_properties["s"] = s
                gt_g.edge_properties["t"] = t

            for col in et.columns:
                try:
                    pm = gt_g.new_edge_property(dtdic[str(et[col].dtype)], et[col].values)
                except KeyError:
                    pm = gt_g.new_edge_property("object", et[col].values)
                # internalize
                gt_g.edge_properties[str(col)] = pm

        return gt_g

[docs]    def append_cp(
        self, directed=False, connection="weak", col_name="cp", label_by_size=True, consolidate_singles=False
    ):
        """Append a component membership column to ``v``.

        Append a column to ``v`` indicating the component membership of each
        node. Requires scipy.

        Parameters
        ----------
        directed : bool, optional (default=False)
            If True , then operate on a directed graph: only move from point i
            to point j along paths csgraph[i, j]. If False, then find the
            shortest path on an undirected graph: the algorithm can progress
            from point i to j along csgraph[i, j] or csgraph[j, i].

        connection : str, optional (default='weak')
            One of {'weak','strong'}. For directed graphs, the type of
            connection to use.  Nodes i and j are strongly connected if a path
            exists both from i to j and from j to i.  Nodes i and j are weakly
            connected if only one of these paths exists. Only has an effect if
            ``directed`` is True

        col_name : str, optional (default='cp')
            The name of the appended column of component labels.

        label_by_size : bool, optional (default=True)
            Whether to rename component membership labels to reflect component
            sizes. If True, the smallest component corresponds to the largest
            label, and the largest component corresponds to the label 0 (or 1
            if ``consolidate_singles`` is True). If False, pass on labels given
            by scipy's connected_components method directly (faster and uses
            less memory).

        consolidate_singles: bool, optional (default=False)
            If True, all singular components (components comprised of one node
            only) are consolidated under the label 0. Also, all other labels
            are renamed to reflect component sizes, see ``label_by_size``.

        Returns
        -------
        v : pd.DataFrame
            appends an extra column to ``v`` indicating component membership.

        """

        from scipy.sparse.csgraph import connected_components

        # create cs graph
        cs_g = self.return_cs_graph()

        # find components
        labels = connected_components(cs_g, directed=directed, connection=connection)[1]

        # append cp column to v
        self.v[col_name] = labels

        # if indicated, consolidate singular components and label by size
        if consolidate_singles:
            cp_counts = self.v[col_name].value_counts()

            # if there are singular components
            f1cp = len(cp_counts) - np.searchsorted(cp_counts.values[::-1], 2)
            rndic = {j: i + 1 for i, j in enumerate(cp_counts.index[:f1cp])}
            rndic.update({i: 0 for i in cp_counts.index[f1cp:]})

            # relabel cp column
            self.v[col_name] = self.v[col_name].apply(lambda x: rndic[x])

        # if indicated, label by size
        elif label_by_size:
            cp_counts = self.v[col_name].value_counts()
            rndic = {j: i for i, j in enumerate(cp_counts.index)}

            # relabel cp column
            self.v[col_name] = self.v[col_name].apply(lambda x: rndic[x])

[docs]    def append_binning_labels_v(self, col, col_name, bins=10, log_bins=False, floor=False, return_bin_edges=False):
        """Append a column with binning labels of the values in ``v[col]``.

        Append a column ``col_name`` to ``v`` with the indices of the bins to
        which each value in ``v[col]`` belongs to.

        If ``bins`` is an int, it determines the number of bins to create. If
        ``log_bins`` is True, this number determines the (approximate) number
        of bins to create for each magnitude. For linear bins, it is the number
        of bins for the whole range of values. If ``floor`` is set True, the
        bin edges are floored to the closest integer. If ``return_bin_edges``
        is set True, the created bin edges are returned.

        If ``bins`` is a sequence, it defines the bin edges, including the
        rightmost edge, allowing for non-uniform bin widths.

        See ``np.digitize`` for details.

        Parameters
        ----------
        col : int or str
            A column name of ``v``, whose corresponding values are binned and
            labelled.

        col_name : str
            The column name for the created labels.

        bins : int or array_lke, optional (default=10)
            If ``bins`` is an int, it determines the number of bins to create.
            If ``log_bins`` is True, this number determines the (approximate)
            number of bins to create for each magnitude. For linear bins, it is
            the number of bins for the whole range of values. If ``bins`` is a
            sequence, it defines the bin edges, including the rightmost edge,
            allowing for non-uniform bin widths.

        log_bins : bool, optional (default=False)
            Whether to use logarithmically or linearly spaced bins.

        floor : bool, optional (default=False)
            Whether to floor the bin edges to the closest integers.

        return_bin_edges : bool, optional (default=False)
            Whether to return the bin edges.

        Returns
        -------
        v : pd.DataFrame
            Appends an extra column ``col_name`` to ``v`` with the binning
            labels.

        bin_edges : np.ndarray
            Optionally, return the created bin edges.

        Examples
        --------
        First, we need a node table:

        >>> import pandas as pd
        >>> import deepgraph as dg
        >>> v = pd.DataFrame({'time': [1,2,12,105,899]})
        >>> g = dg.DeepGraph(v)

        >>> g.v
           time
        0     1
        1     2
        2    12
        3   105
        4   899

        Binning time values with default arguments:

        >>> bin_edges = g.append_binning_labels_v('time', 'time_l',
        ...                                       return_bin_edges=True)

        >>> bin_edges
        array([   1.        ,  100.77777778,  200.55555556,  300.33333333,
                400.11111111,  499.88888889,  599.66666667,  699.44444444,
                799.22222222,  899.        ])

        >>> g.v
           time  time_l
        0     1       1
        1     2       1
        2    12       1
        3   105       2
        4   899      10

        Binning time values with logarithmically spaced bins:

        >>> bin_edges = g.append_binning_labels_v('time', 'time_l', bins=5,
        ...                                       log_bins=True,
        ...                                       return_bin_edges=True)

        >>> bin_edges
        array([   1.        ,    1.62548451,    2.64219989,    4.29485499,
                  6.98122026,   11.34786539,   18.44577941,   29.9833287 ,
                 48.73743635,   79.22194781,  128.77404899,  209.32022185,
                340.24677814,  553.06586728,  899.        ])

        >>> g.v
           time  time_l
        0     1       1
        1     2       2
        2    12       6
        3   105      10
        4   899      15

        Binning time values with logarithmically spaced bins (floored):

        >>> bin_edges = g.append_binning_labels_v('time', 'time_l', bins=5,
        ...                                       log_bins=True, floor=True,
        ...                                       return_bin_edges=True)

        >>> bin_edges
        array([   1.,    2.,    4.,    6.,   11.,   18.,   29.,   48.,   79.,
                128.,  209.,  340.,  553.,  899.])

        >>> g.v
           time  time_l
        0     1       1
        1     2       2
        2    12       5
        3   105       9
        4   899      14

        """

        x = self.v[col]

        # create bins
        if _is_array_like(bins):
            bin_edges = bins
        else:
            bin_edges = _create_bin_edges(x, bins, log_bins, floor)

        self.v[col_name] = np.digitize(x, bin_edges)

        if return_bin_edges:
            return bin_edges

    def append_datetime_categories_v(self, col="time", timeofday=None, met_season=None):
        """Append datetime categories to ``v``.

        Appends a "time of the day" and/or a meteorological season to ``v``,
        based on a given datetime column ``col``.

        Parameters
        ----------
        col : str, optional (default='time')
            A column of ``v`` comprised of datetimes.

        timeofday : str, optional (default=None)
            If given, the time of the day is appended as a column with the
            label ``timeofday`` to ``v``. The time of the day is defined
            as::

                [00:06[ = 0 (night)
                [06:12[ = 1 (forenoon)
                [12:18[ = 2 (afternoon)
                [18:24] = 3 (evening)

        met_season : str, optional (default=None)
            If given, the modern mid-latitude meteorological season, see
            http://en.wikipedia.org/wiki/Season#Modern_mid-latitude_meteorological
            is appended as a column with the label
            ``met_season`` to ``v``. The season is defined as:

                [12:03[ = 0
                [03:06[ = 1
                [06:09[ = 2
                [09:12[ = 3

        Returns
        -------
        v : pd.DataFrame
            appends extra column(s) to ``v`` with datetime properties.

        """

        def _timeofday(datetimes):
            def categorize(hour):
                if hour < 6:
                    return 0
                elif hour >= 6 and hour < 12:
                    return 1
                elif hour >= 12 and hour < 18:
                    return 2
                elif hour >= 18 and hour <= 24:
                    return 3

            hour = datetimes.apply(lambda x: x.hour)
            timeofday = hour.apply(categorize).values
            return timeofday

        def _met_season(datetimes):
            def season(month):
                if month >= 12 or month < 3:
                    return 0
                elif month >= 3 and month < 6:
                    return 1
                elif month >= 6 and month < 9:
                    return 2
                elif month >= 9 and month < 12:
                    return 3

            month = datetimes.apply(lambda x: x.month)
            season = month.apply(season).values
            return season

        if timeofday:
            self.v[timeofday] = _timeofday(self.v[col])
            self.v[timeofday] = self.v[timeofday].astype("uint8")

        if met_season:
            self.v[met_season] = _met_season(self.v[col])
            self.v[met_season] = self.v[met_season].astype("uint8")

[docs]    def update_edges(self):
        """After removing nodes in ``v``, update ``e``.

        If you deleted rows from ``v``, you can remove all edges associated
        with the deleted nodes in ``e`` by calling this method.

        Returns
        -------
        e : pd.DataFrame
            update ``e``

        """

        # reduce edge table
        if hasattr(self, "e"):
            s = self.e.index.get_level_values(0)
            t = self.e.index.get_level_values(1)
            self.e = self.e.loc[(s.isin(self.v.index)) & (t.isin(self.v.index))]

[docs]    def filter_by_interval_v(self, col, interval, endpoint=True):
        """Keep only nodes in ``v`` with features of type ``col`` in
        ``interval``.

        Remove all nodes from ``v`` (and their corresponding edges in ``e``)
        with features of type ``col`` outside the interval given by a tuple of
        values. The endpoint is included, if ``endpoint`` is not set to False.

        Parameters
        ----------
        col : str or int
            A column name of ``v``, indicating the type of feature used in the
            filtering.

        interval : tuple
            A tuple of two values, (value, larger_value). All nodes outside the
            interval are removed.

        endpoint : bool, optional (default=True)
            False excludes the endpoint.

        Returns
        -------
        v : pd.DataFrame
            update ``v``

        e : pd.DataFrame
            update ``e``

        """

        # reduce node table
        if endpoint:
            self.v = self.v[(self.v[col] >= interval[0]) & (self.v[col] <= interval[1])]
        else:
            self.v = self.v[(self.v[col] >= interval[0]) & (self.v[col] < interval[1])]

        # reduce edge table
        if hasattr(self, "e"):
            self.update_edges()

[docs]    def filter_by_interval_e(self, col, interval, endpoint=True):
        """Keep only edges in ``e`` with relations of type ``col`` in
        ``interval``.

        Remove all edges from ``e`` with relations of type ``col`` outside the
        interval given by a tuple of values. The endpoint is included, if
        ``endpoint`` is not set to False.

        Parameters
        ----------
        col : str or int
            A column name of ``e``, indicating the type of relation used in the
            filtering.

        interval : tuple
            A tuple of two values, (value, larger_value). All edges outside the
            interval are removed.

        endpoint : bool, optional (default=True)
            False excludes the endpoint.

        Returns
        -------
        e : pd.DataFrame
            update ``e``

        """

        # reduce edge table
        if endpoint:
            self.e = self.e[(self.e[col] >= interval[0]) & (self.e[col] <= interval[1])]
        else:
            self.e = self.e[(self.e[col] >= interval[0]) & (self.e[col] < interval[1])]

[docs]    def filter_by_values_v(self, col, values):
        """Keep only nodes in ``v`` with features of type ``col`` in
        ``values``.

        Remove all nodes from ``v`` (and their corresponding edges in
        ``e``) with feature(s) of type ``col`` not in the list of features
        given by ``values``.

        Parameters
        ----------
        col : str or int
            A column name of ``v``, indicating the type of feature used in the
            filtering.

        values : object or array_like
            The value(s) indicating which nodes to keep.

        Returns
        -------
        v : pd.DataFrame
            update ``v``

        e : pd.DataFrame
            update ``e``

        """

        # reduce node table
        if not _is_array_like(values):
            values = [values]
        self.v = self.v[(self.v[col].isin(values))]

        # reduce edge table
        if hasattr(self, "e"):
            self.update_edges()

[docs]    def filter_by_values_e(self, col, values):
        """Keep only edges in ``e`` with relations of type ``col`` in
        ``values``.

        Remove all edges from ``e`` with relation(s) of type ``col`` not in the
        list of relations given by ``values``.

        Parameters
        ----------
        col : str or int
            A column name of ``e``, indicating the type of relation used in the
            filtering.

        values : object or array_like
            The value(s) indicating which edges to keep.

        Returns
        -------
        e : pd.DataFrame
            update ``e``

        """

        # reduce node table
        if not _is_array_like(values):
            values = [values]
        self.e = self.e[(self.e[col].isin(values))]

[docs]    def plot_2d(
        self,
        x,
        y,
        edges=False,
        C=None,
        C_split_0=None,
        kwds_scatter=None,
        kwds_quiver=None,
        kwds_quiver_0=None,
        ax=None,
    ):
        """Plot nodes and corresponding edges in 2 dimensions.

        Create a scatter plot of the nodes in ``v``, and optionally a quiver
        plot of the corresponding edges in ``e``.

        The xy-coordinates of the scatter plot are determined by the values of
        ``v[x]`` and ``v[y]``, where ``x`` and ``y`` are column names of ``v``
        (the arrow's coordinates are determined automatically).

        In order to map colors to the arrows, either ``C`` or ``C_split_0``
        can be be passed, an array of the same length as ``e``. Passing ``C``
        creates a single quiver plot (qu). Passing ``C_split_0`` creates two
        separate quiver plots, one for all edges where ``C_split_0`` == 0
        (qu_0), and one for all other edges (qu). By default, the arrows of
        qu_0 have no head, indicating "undirected" edges. This can be useful,
        for instance, when ``C_split_0`` represents an array of temporal
        distances.

        In order to control the plotting parameters of the scatter, quiver
        and/or quiver_0 plots, one may pass keyword arguments by setting
        ``kwds_scatter``, ``kwds_quiver`` and/or ``kwds_quiver_0``.

        Can be used iteratively by passing ``ax``.

        Parameters
        ----------
        x : int or str
            A column name of ``v``, determining the x-coordinates of the
            scatter plot of nodes.

        y : int or str
            A column name of ``v``, determining the y-coordinates of the
            scatter plot of nodes.

        edges : bool, optional (default=True)
            Whether to create a quiver plot (2-D field of arrows) of the edges
            between the nodes.

        C : array_like, optional (default=None)
            An optional array used to map colors to the arrows. Must have the
            same length es ``e``. Has no effect if ``C_split_0`` is passed as
            an argument.

        C_split_0 : array_like, optional (default=None)
            An optional array used to map colors to the arrows. Must have the
            same length es ``e``. If this parameter is passed, ``C`` has no
            effect, and two separate quiver plots are created (qu and qu_0).

        kwds_scatter : dict, optional (default=None)
            kwargs to be passed to scatter.

        kwds_quiver : dict, optional (default=None)
            kwargs to be passed to quiver (qu).

        kwds_quiver_0 : dict, optional (default=None)
            kwargs to be passed to quiver (qu_0). Only has an effect if
            ``C_split_0`` has been set.

        ax : matplotlib axes object, optional (default=None)
            An axes instance to use.

        Returns
        -------
        obj : dict
            If ``C_split_0`` has been passed, return a dict of matplotlib
            objects with the following keys: ['fig', 'ax', 'pc', 'qu', 'qu_0'].
            Otherwise, return a dict with keys: ['fig', 'ax', 'pc', 'qu'].

        Notes
        -----
        When passing ``C_split_0``, the color of the arrows in qu_0 can be set
        by passing the keyword argument `color` to ``kwds_quiver_0``. The color
        of the arrows in qu, however, are determined by ``C_split_0``.

        The default drawing order is set to:
        1. quiver_0 (zorder=1)
        2. quiver (zorder=2)
        3. scatter (zorder=3)
        This order can be changed by setting the ``zorder`` in
        ``kwds_quiver_0``, ``kwds_quiver`` and/or ``kwds_scatter``.
        See also http://matplotlib.org/examples/pylab_examples/zorder_demo.html

        See also
        --------
        plot_2d_generator
        plot_3d
        plot_map
        plot_map_generator

        """

        return self._plot_2d(
            is_map=False,
            x=x,
            y=y,
            edges=edges,
            C=C,
            C_split_0=C_split_0,
            kwds_scatter=kwds_scatter,
            kwds_quiver=kwds_quiver,
            kwds_quiver_0=kwds_quiver_0,
            kwds_basemap=None,
            ax=ax,
            m=None,
        )

[docs]    def plot_2d_generator(
        self,
        x,
        y,
        by,
        edges=False,
        C=None,
        C_split_0=None,
        kwds_scatter=None,
        kwds_quiver=None,
        kwds_quiver_0=None,
        passable_ax=False,
    ):
        """Plot nodes and corresponding edges by groups.

        Create a generator of scatter plots of the nodes in ``v``, split in
        groups by ``v``.groupby(``by``). If edges is set True, also create a
        quiver plot of each group's corresponding edges.

        The xy-coordinates of the scatter plots are determined by the values of
        ``v[x]`` and ``v[y]``, where ``x`` and ``y`` are column names of ``v``
        (the arrow's coordinates are determined automatically).

        In order to map colors to the arrows, either ``C`` or ``C_split_0``
        can be be passed, an array of the same length as ``e``. Passing ``C``
        creates a single quiver plot (qu). Passing ``C_split_0`` creates two
        separate quiver plots, one for all edges where ``C_split_0`` == 0
        (qu_0), and one for all other edges (qu). By default, the arrows of
        qu_0 have no head, indicating "undirected" edges. This can be useful,
        for instance, when ``C_split_0`` represents an array of temporal
        distances.

        When mapping colors to arrows by setting ``C`` (or ``C_split_0``),
        `clim` is automatically set to the min and max values of the entire
        array. In case one wants clim to be set to min and max values for each
        group's colors, one may explicitly pass `clim` = None to
        ``kwds_quiver``.

        The same behaviour occurs when passing a sequence of ``g.n`` Numbers as
        colors `c` to ``kwds_scatter``. In that case, `vmin` and `vmax` are
        automatically set to `c`.min() and `c`.max() of all nodes. Explicitly
        setting `vmin` and `vmax` to `None`, the min and max values of the
        groups' color arrays are used.

        In order to control the plotting parameters of the scatter, quiver
        and/or quiver_0 plots, one may pass keyword arguments by setting
        ``kwds_scatter``, ``kwds_quiver`` and/or ``kwds_quiver_0``.

        If ``passable_ax`` is True, create a generator of functions. Each
        function takes a matplotlib axes object as input, and returns a
        scatter/quiver plot.

        Parameters
        ----------
        x : int or str
            A column name of ``v``, determining the x-coordinates of the
            scatter plot of nodes.

        y : int or str
            A column name of ``v``, determining the y-coordinates of the
            scatter plot of nodes.

        by : array_like
            Column name(s) of ``v``, determining the groups to create plots of.

        edges : bool, optional (default=True)
            Whether to create a quiver plot (2-D field of arrows) of the edges
            between the nodes.

        C : array_like, optional (default=None)
            An optional array used to map colors to the arrows. Must have the
            same length es ``e``. Has no effect if ``C_split_0`` is passed as
            an argument.

        C_split_0 : array_like, optional (default=None)
            An optional array used to map colors to the arrows. Must have the
            same length es ``e``. If this parameter is passed, ``C`` has no
            effect, and two separate quiver plots are created (qu and qu_0).

        kwds_scatter : dict, optional (default=None)
            kwargs to be passed to scatter.

        kwds_quiver : dict, optional (default=None)
            kwargs to be passed to quiver (qu).

        kwds_quiver_0 : dict, optional (default=None)
            kwargs to be passed to quiver (qu_0). Only has an effect if
            ``C_split_0`` has been set.

        passable_ax : bool, optional (default=False)
            If True, return a generator of functions. Each function takes a
            matplotlib axes object as input, and returns a dict of matplotlib
            objects.

        Returns
        -------
        obj : generator
            If ``C_split_0`` has been passed, return a generator of dicts of
            matplotlib objects with the following keys: ['fig', 'ax', 'pc',
            'qu', 'qu_0', 'group']. Otherwise, return a generator of dicts
            with keys: ['fig', 'ax', 'pc', 'qu', 'group'].
            If ``passable_ax`` is True, return a generator of functions. Each
            function takes a matplotlib axes object as input, and returns a
            dict as described above.

        Notes
        -----
        When passing ``C_split_0``, the color of the arrows in qu_0 can be set
        by passing the keyword argument `color` to ``kwds_quiver_0``. The color
        of the arrows in qu, however, are determined by ``C_split_0``.

        The default drawing order is set to:
        1. quiver_0 (zorder=1)
        2. quiver (zorder=2)
        3. scatter (zorder=3)
        This order can be changed by setting the ``zorder`` in
        ``kwds_quiver_0``, ``kwds_quiver`` and/or ``kwds_scatter``.
        See also http://matplotlib.org/examples/pylab_examples/zorder_demo.html

        See also
        --------
        append_binning_labels_v
        plot_2d
        plot_3d
        plot_map
        plot_map_generator

        """

        return self._plot_2d_generator(
            is_map=False,
            x=x,
            y=y,
            by=by,
            edges=edges,
            C=C,
            C_split_0=C_split_0,
            kwds_basemap=None,
            kwds_scatter=kwds_scatter,
            kwds_quiver=kwds_quiver,
            kwds_quiver_0=kwds_quiver_0,
            passable_ax=passable_ax,
        )

[docs]    def plot_map(
        self,
        lon,
        lat,
        edges=False,
        C=None,
        C_split_0=None,
        kwds_basemap=None,
        kwds_scatter=None,
        kwds_quiver=None,
        kwds_quiver_0=None,
        ax=None,
        m=None,
    ):
        """Plot nodes and corresponding edges on a basemap.

        Create a scatter plot of the nodes in ``v`` and optionally a quiver
        plot of the corresponding edges in ``e`` on a
        ``mpl_toolkits.basemap.Basemap`` instance.

        The coordinates of the scatter plot are determined by the node's
        longitudes and latitudes (in degrees): ``v[lon]`` and ``v[lat]``, where
        ``lon`` and ``lat`` are column names of ``v`` (the arrow's coordinates
        are determined automatically).

        In order to map colors to the arrows, either ``C`` or ``C_split_0``
        can be be passed, an array of the same length as ``e``. Passing ``C``
        creates a single quiver plot (qu). Passing ``C_split_0`` creates two
        separate quiver plots, one for all edges where ``C_split_0`` == 0
        (qu_0), and one for all other edges (qu). By default, the arrows of
        qu_0 have no head, indicating "undirected" edges. This can be useful,
        for instance, when ``C_split_0`` represents an array of temporal
        distances.

        In order to control the parameters of the basemap, scatter, quiver
        and/or quiver_0 plots, one may pass keyword arguments by setting
        ``kwds_basemap``, ``kwds_scatter``, ``kwds_quiver`` and/or
        ``kwds_quiver_0``.

        Can be used iteratively by passing ``ax`` and/or ``m``.

        Parameters
        ----------
        lon : int or str
            A column name of ``v``. The corresponding values must be longitudes
            in degrees.

        lat : int or str
            A column name of ``v``. The corresponding values must be latitudes
            in degrees.

        edges : bool, optional (default=True)
            Whether to create a quiver plot (2-D field of arrows) of the edges
            between the nodes.

        C : array_like, optional (default=None)
            An optional array used to map colors to the arrows. Must have the
            same length es ``e``. Has no effect if ``C_split_0`` is passed as
            an argument.

        C_split_0 : array_like, optional (default=None)
            An optional array used to map colors to the arrows. Must have the
            same length es ``e``. If this parameter is passed, ``C`` has no
            effect, and two separate quiver plots are created (qu and qu_0).

        kwds_basemap : dict, optional (default=None)
            kwargs passed to basemap.

        kwds_scatter : dict, optional (default=None)
            kwargs to be passed to scatter.

        kwds_quiver : dict, optional (default=None)
            kwargs to be passed to quiver (qu).

        kwds_quiver_0 : dict, optional (default=None)
            kwargs to be passed to quiver (qu_0). Only has an effect if
            ``C_split_0`` has been set.

        ax : matplotlib axes object, optional (default=None)
            An axes instance to use.

        m : Basemap object, optional (default=None)
            A mpl_toolkits.basemap.Basemap instance to use.

        Returns
        -------
        obj : dict
            If ``C_split_0`` has been passed, return a dict of matplotlib
            objects with the following keys: ['fig', 'ax', 'm', 'pc', 'qu',
            'qu_0']. Otherwise, return a dict with keys: ['fig', 'ax', 'm',
            'pc', 'qu'].

        Notes
        -----
        When passing ``C_split_0``, the color of the arrows in qu_0 can be set
        by passing the keyword argument `color` to ``kwds_quiver_0``. The color
        of the arrows in qu, however, are determined by ``C_split_0``.

        The default drawing order is set to:
        1. quiver_0 (zorder=1)
        2. quiver (zorder=2)
        3. scatter (zorder=3)
        This order can be changed by setting the ``zorder`` in
        ``kwds_quiver_0``, ``kwds_quiver`` and/or ``kwds_scatter``.
        See also http://matplotlib.org/examples/pylab_examples/zorder_demo.html

        See also
        --------
        plot_map_generator
        plot_2d
        plot_2d_generator
        plot_3d

        """

        return self._plot_2d(
            is_map=True,
            x=lon,
            y=lat,
            edges=edges,
            C=C,
            C_split_0=C_split_0,
            kwds_basemap=kwds_basemap,
            kwds_scatter=kwds_scatter,
            kwds_quiver=kwds_quiver,
            kwds_quiver_0=kwds_quiver_0,
            ax=ax,
            m=m,
        )

[docs]    def plot_map_generator(
        self,
        lon,
        lat,
        by,
        edges=False,
        C=None,
        C_split_0=None,
        kwds_basemap=None,
        kwds_scatter=None,
        kwds_quiver=None,
        kwds_quiver_0=None,
        passable_ax=False,
    ):
        """Plot nodes and corresponding edges by groups, on basemaps.

        Create a generator of scatter plots of the nodes in ``v``, split in
        groups by ``v``.groupby(``by``), on a ``mpl_toolkits.basemap.Basemap``
        instance. If edges is set True, also create a quiver plot of each
        group's corresponding edges.

        The coordinates of the scatter plots are determined by the node's
        longitudes and latitudes (in degrees): ``v[lon]`` and ``v[lat]``, where
        ``lon`` and ``lat`` are column names of ``v`` (the arrow's coordinates
        are determined automatically).

        In order to map colors to the arrows, either ``C`` or ``C_split_0``
        can be be passed, an array of the same length as ``e``. Passing ``C``
        creates a single quiver plot (qu). Passing ``C_split_0`` creates two
        separate quiver plots, one for all edges where ``C_split_0`` == 0
        (qu_0), and one for all other edges (qu). By default, the arrows of
        qu_0 have no head, indicating "undirected" edges. This can be useful,
        for instance, when ``C_split_0`` represents an array of temporal
        distances.

        When mapping colors to arrows by setting ``C`` (or ``C_split_0``),
        `clim` is automatically set to the min and max values of the entire
        array. In case one wants clim to be set to min and max values for each
        group's colors, one may explicitly pass `clim` = None to
        ``kwds_quiver``.

        The same behaviour occurs when passing a sequence of ``g.n`` Numbers as
        colors `c` to ``kwds_scatter``. In that case, `vmin` and `vmax` are
        automatically set to `c`.min() and `c`.max() of all nodes. Explicitly
        setting `vmin` and `vmax` to `None`, the min and max values of the
        groups' color arrays are used.

        In order to control the parameters of the basemap, scatter, quiver
        and/or quiver_0 plots, one may pass keyword arguments by setting
        ``kwds_basemap``, ``kwds_scatter``, ``kwds_quiver`` and/or
        ``kwds_quiver_0``.

        If ``passable_ax`` is True, create a generator of functions. Each
        function takes a matplotlib axes object (and/or a Basemap object) as
        input, and returns a scatter/quiver plot.

        Parameters
        ----------
        lon : int or str
            A column name of ``v``. The corresponding values must be longitudes
            in degrees.

        lat : int or str
            A column name of ``v``. The corresponding values must be latitudes
            in degrees.

        by : array_like
            Column name(s) of ``v``, determining the groups to create plots of.

        edges : bool, optional (default=True)
            Whether to create a quiver plot (2-D field of arrows) of the edges
            between the nodes.

        C : array_like, optional (default=None)
            An optional array used to map colors to the arrows. Must have the
            same length es ``e``. Has no effect if ``C_split_0`` is passed as
            an argument.

        C_split_0 : array_like, optional (default=None)
            An optional array used to map colors to the arrows. Must have the
            same length es ``e``. If this parameter is passed, ``C`` has no
            effect, and two separate quiver plots are created (qu and qu_0).

        kwds_basemap : dict, optional (default=None)
            kwargs passed to basemap.

        kwds_scatter : dict, optional (default=None)
            kwargs to be passed to scatter.

        kwds_quiver : dict, optional (default=None)
            kwargs to be passed to quiver (qu).

        kwds_quiver_0 : dict, optional (default=None)
            kwargs to be passed to quiver (qu_0). Only has an effect if
            ``C_split_0`` has been set.

        passable_ax : bool, optional (default=False)
            If True, return a generator of functions. Each function takes a
            matplotlib axes object (and/or a Basemap object) as input, and
            returns a dict of matplotlib objects.

        Returns
        -------
        obj : generator
            If ``C_split_0`` has been passed, return a generator of dicts of
            matplotlib objects with the following keys: ['fig', 'ax', 'm',
            'pc', 'qu', 'qu_0', 'group']. Otherwise, return a generator of
            dicts with keys: ['fig', 'ax', 'm', 'pc', 'qu', 'group'].
            If ``passable_ax`` is True, return a generator of functions. Each
            function takes a matplotlib axes object (and/or a Basemap object)
            as input, and returns a dict as described above.

        Notes
        -----
        When passing ``C_split_0``, the color of the arrows in qu_0 can be set
        by passing the keyword argument `color` to ``kwds_quiver_0``. The color
        of the arrows in qu, however, are determined by ``C_split_0``.

        The default drawing order is set to:
        1. quiver_0 (zorder=1)
        2. quiver (zorder=2)
        3. scatter (zorder=3)
        This order can be changed by setting the ``zorder`` in
        ``kwds_quiver_0``, ``kwds_quiver`` and/or ``kwds_scatter``.
        See also http://matplotlib.org/examples/pylab_examples/zorder_demo.html

        See also
        --------
        append_binning_labels_v
        plot_map
        plot_2d
        plot_2d_generator
        plot_3d

        """

        return self._plot_2d_generator(
            is_map=True,
            x=lon,
            y=lat,
            by=by,
            edges=edges,
            C=C,
            C_split_0=C_split_0,
            kwds_basemap=kwds_basemap,
            kwds_scatter=kwds_scatter,
            kwds_quiver=kwds_quiver,
            kwds_quiver_0=kwds_quiver_0,
            passable_ax=passable_ax,
        )

    def plot_3d(self, x, y, z, edges=False, kwds_scatter=None, kwds_quiver=None, ax=None):
        """Work in progress!

        experimental, quiver3D scaling?

        See also
        --------
        plot_2d
        plot_2d_generator
        plot_map
        plot_map_generator

        """

        # set kwds
        if kwds_scatter is None:
            kwds_scatter = {}
        if kwds_quiver is None:
            kwds_quiver = {}

        from mpl_toolkits.mplot3d.axes3d import Axes3D  # @UnusedImport

        # return dict of matplotlib objects
        obj = {}

        # create figure and axes
        if ax is None:
            fig = plt.figure()
            ax = fig.add_subplot(111, projection="3d")
        else:
            fig = ax.get_figure()

        # create PathCollection by scatter
        x, y, z = (self.v[x], self.v[y], self.v[z])
        pc = plt.scatter(x, y, zs=z, zdir="z", **kwds_scatter)

        obj["pc"] = pc

        # draw edges as arrows
        if edges is True:
            # get unique indices of edgeed nodes
            s = self.e.index.get_level_values(level=0).values
            t = self.e.index.get_level_values(level=1).values

            # xy position of sources, delta xy
            xs, ys, zs = (x.loc[s].values, y.loc[s].values, z.loc[s].values)
            xt, yt, zt = (x.loc[t].values, y.loc[t].values, z.loc[t].values)

            # upcast dtypes
            xs = np.array(xs, dtype=float)
            ys = np.array(ys, dtype=float)
            zs = np.array(zs, dtype=float)
            xt = np.array(xt, dtype=float)
            yt = np.array(yt, dtype=float)
            zt = np.array(zs, dtype=float)

            dx = xt - xs
            dy = yt - ys
            dz = zt - zs

            qu = ax.quiver(xs, ys, zs, dx, dy, dz, **kwds_quiver)

            obj["qu"] = qu

        return obj

    def plot_rects_label_numeric(self, label, xl, xr, colors=None, ax=None, **kwargs):
        """Work in progress!

        Plot rectangles given by `label_xl_xr_df`.

        Parameters
        ----------
        label_xl_xr_df : pd.DataFrame
            A pandas.DataFrame object with three columns, the first column
            containing the categorical variable (labels),
            the second column containing the left x values, the
            third column the right x values of the boxes.
        kwargs : keywords
            kwargs to pass to matplotlib.pyplot.vlines

        Returns
        -------
        obj : dict of matplotlib objects
            Keys are ['fig', 'ax', 'vlines']

        See also
        --------
        plot_rects_numeric_numeric

        """

        from matplotlib.collections import PolyCollection

        v = self.v[[label, xl, xr]]

        # return dict of matplotlib objects
        obj = {}

        # create figure and axes
        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = ax.get_figure()

        obj["fig"] = fig
        obj["ax"] = ax

        # include colors in dataframe for sorting
        if colors is not None:
            v["color"] = colors
        else:
            v["color"] = 1

        # rectangle coordinates
        xl, xr = (v[xl].values, v[xr].values)
        widths = xr - xl
        yb = v[label] - 0.6
        heights = np.ones(len(xr)) * 1.2

        recs = []
        for x, y, width, height in zip(xl, yb, widths, heights):
            recs.append(((x, y), (x, y + height), (x + width, y + height), (x + width, y)))

        # create poly collection of rectangles
        c = PolyCollection(recs, **kwargs)

        # set colors
        c.set_array(v["color"])
        obj["c"] = c

        # add PolyCollection
        ax.add_collection(c)

        # set yticklabels
        positions = np.arange(v[label].max() + 1)
        ax.set_yticks(positions)

        # set x/y lims
        dx = 0.05 * (xr.max() - xl.min())
        dy = 0.05 * (yb.max() + 1.2 - yb.min())
        ax.set_xlim((xl.min() - dx, xr.max() + dx))
        ax.set_ylim((yb.min() - dy, yb.max() + 1.2 + dy))

        return obj

    def plot_rects_numeric_numeric(self, yb, yt, xl, xr, colors=None, ax=None, **kwargs):
        """Work in progress!

        Create a raster plot of all components given by `yb_yt_xl_xr_df`.

        Parameters
        ----------
        yb_yt_xl_xr_df : pd.DataFrame
            A pandas.DataFrame object with four columns
        kwargs : keywords
            kwargs to pass to matplotlib.pyplot.vlines

        Returns
        -------
        obj : dict of matplotlib objects
            Keys are ['fig', 'ax', 'vlines']

        See also
        --------
        box_label_numeric

        """

        from matplotlib.collections import PolyCollection

        v = self.v[[yb, yt, xl, xr]]

        # return dict of matplotlib objects
        obj = {}

        # create figure and axes
        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = ax.get_figure()

        obj["fig"] = fig
        obj["ax"] = ax

        # include colors in dataframe for sorting
        if colors is not None:
            v["color"] = colors
        else:
            v["color"] = 1

        # rectangle coordinates
        xl, xr = (v[xl].values, v[xr].values)
        widths = xr - xl
        yb, yt = (v[yb], v[yt])
        heights = yt - yb

        recs = []
        for x, y, width, height in zip(xl, yb, widths, heights):
            recs.append(((x, y), (x, y + height), (x + width, y + height), (x + width, y)))

        # create poly collection of rectangles
        c = PolyCollection(recs, **kwargs)

        # set colors
        c.set_array(v["color"])

        obj["c"] = c

        # add PolyCollection
        ax.add_collection(c)

        # set x/y lims
        dx = 0.05 * (xr.max() - xl.min())
        dy = 0.05 * (yt.max() - yb.min())
        ax.set_xlim((xl.min() - dx, xr.max() + dx))
        ax.set_ylim((yb.min() - dy, yt.max() + dy))

        return obj

    def plot_raster(self, label, time="time", ax=None, **kwargs):
        """Work in progress!

        Create a raster plot of all nodes given by `supernode_id_time_df`.

        Parameters
        ----------
        supernode_id_time_df : pd.DataFrame
            A pandas.DataFrame object with two columns, the first column
            containing the labels, the second column containing the
            times of the nodes.
        kwargs : keywords
            kwargs to pass to matplotlib.pyplot.vlines

        Returns
        -------
        obj : dict of matplotlib objects
            Keys are ['fig', 'ax', 'vlines']

        """

        # return dict of matplotlib objects
        obj = {}

        # create figure and axes
        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = ax.get_figure()

        obj["fig"] = fig
        obj["ax"] = ax

        # sort by labels
        v = self.v[[label, time]].sort_values(label)

        # unique labels
        labels = v[label].unique()

        # create raster plot
        vlines = []
        for i, l in enumerate(labels):
            vlines.append(ax.vlines(v[v[label] == l][time].values, i + 0.5, i + 1.5, **kwargs))

        obj["vlines"] = vlines

        # set labels as yticklabels
        positions = np.arange(1, len(labels) + 1)
        labels = labels

        ax.set_yticks(positions)
        ax.set_yticklabels(labels)

        # set x/y lims
        dx = 0.05 * (v[time].max() - v[time].min())
        dy = 0.05 * (positions.max() - positions.min())
        ax.set_xlim((v[time].min() - dx, v[time].max() + dx))
        ax.set_ylim((positions.min() - dy, positions.max() + dy))

        # set x/y label
        ax.set_xlabel("time")
        ax.set_ylabel(label)

        return obj

[docs]    @staticmethod
    def plot_hist(x, bins=10, log_bins=False, density=False, floor=False, ax=None, **kwargs):
        """Plot a histogram (or pdf) of x.

        Compute and plot the histogram (or probability density) of x. Keyword
        arguments are passed to plt.plot. See parameters and ``np.histogram``
        for details.

        Parameters
        ----------
        x : array_like
            The data from which a frequency distribution is plot.

        bins : int or array_like, optional (default=10)
            If ``bins`` is an int, it determines the number of bins to create.
            If ``log_bins`` is True, this number determines the (approximate)
            number of bins to create for each magnitude. For linear bins, it is
            the number of bins for the whole range of values. If ``bins`` is a
            sequence, it defines the bin edges, including the rightmost edge,
            allowing for non-uniform bin widths.

        log_bins : bool, optional (default=False)
            Whether to use logarithmically or linearly spaced bins.

        density : bool, optional (default=False)
            If False, the result will contain the number of samples in each
            bin.  If True, the result is the value of the probability *density*
            function at the bin, normalized such that the *integral* over the
            range is 1. Note that the sum of the histogram values will not be
            equal to 1 unless bins of unity width are chosen; it is not a
            probability *mass* function.

        floor : bool, optional (default=False)
            Whether to floor the bin edges to the closest integers. Only has an
            effect if ``bins`` is an int.

        ax : matplotlib axes object, optional (default=None)
            An axes instance to use.

        Returns
        -------
        ax : matplotlib axes object
            A matplotlib axes instance.

        hist : np.ndarray
            The values of the histogram. See ``density``.

        bin_edges : np.ndarray
            The edges of the bins.

        """

        # create bins
        if _is_array_like(bins):
            bin_edges = bins
        else:
            bin_edges = _create_bin_edges(x, bins, log_bins, floor)

        # counts and bin_centers
        hist, _ = np.histogram(x, bin_edges, density=density)
        hist = hist.astype(float)
        hist[hist == 0] = np.nan
        bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2.0

        # plot
        if ax is None:
            _, ax = plt.subplots()

        ax.plot(bin_centers, hist, **kwargs)

        # set scales
        if log_bins:
            ax.set_xscale("log")

        return ax, hist, bin_edges

[docs]    @staticmethod
    def plot_logfile(logfile):
        """Plot a logfile.

        Plot a benchmark logfile created by ``create_edges`` or
        ``create_edges_ft``.

        Parameters
        ----------
        logfile : str
            The filename of the logfile.

        Returns
        -------
        obj : dict
            Depending on the logfile, return a dict of matplotlib objects with
            a subset of the following keys: ['fig', 'ax', 'pc_n', 'pc_e',
            'cb_n', 'cb_e']

        """

        # load data from log file
        logfile = np.loadtxt(logfile)

        # return dict of matplotlib objects
        obj = {}

        #     0            1            2            3
        # exceeded | nr.of pairs | nr.of edges | comp.time

        # partition by non-/exceeded max_pairs
        log_n = logfile[logfile[:, 0] == 0]
        log_e = logfile[logfile[:, 0] == 1]

        fig, ax = plt.subplots()

        obj["fig"] = fig
        obj["ax"] = ax

        # scatter normal iterations
        pc_n = ax.scatter(
            log_n[:, 1], log_n[:, 3], s=20, c=np.log10(log_n[:, 2] + 1), marker="o", label="normal", edgecolors="none"
        )

        obj["pc_n"] = pc_n

        # scatter max_pair exceeded iterations
        pc_e = ax.scatter(
            log_e[:, 1],
            log_e[:, 3],
            s=30,
            c=np.log10(log_e[:, 2] + 1),
            cmap="gist_earth",
            marker="D",
            label="max_pairs exceeded",
        )

        obj["pc_e"] = pc_e

        msg = "iterations: {:d} | total time: {:.2f}s | total edges: {:d}"
        ax.set_title(msg.format(len(logfile), logfile[:, 3].sum(), int(logfile[:, 2].sum())))
        ax.set_xlabel("nr.of pairs")
        ax.set_ylabel("comp.time (s)")
        ax.set_xscale("log")
        ax.legend(loc=2)
        ax.grid()

        if len(log_e) == 0:
            cb_n = fig.colorbar(pc_n, fraction=0.03)
            cb_n.set_label("log10(n_edges) (normal)")

            fig.tight_layout()

            obj["cb_n"] = cb_n

        elif len(log_n) == 0:
            cb_e = fig.colorbar(pc_e, fraction=0.03)
            cb_e.set_label("log10(n_edges) (exceeded)")

            fig.tight_layout()

            obj["cb_e"] = cb_e

        else:
            cb_e = fig.colorbar(pc_e, fraction=0.03)
            cb_n = fig.colorbar(pc_n, fraction=0.03)
            cb_n.set_label("log10(n_edges) (normal)")
            cb_e.set_label("log10(n_edges) (exceeded)")

            fig.tight_layout()

            obj["cb_n"] = cb_n
            obj["cb_e"] = cb_e

        return obj

    @property
    def n(self):
        """The number of nodes"""
        if hasattr(self, "v"):
            if isinstance(self.v, pd.HDFStore):
                if len(self.v.keys()) == 1:
                    n = self.v.get_storer(self.v.keys()[0]).nrows
                else:
                    n = "NA"
            else:
                n = len(self.v)
        else:
            n = 0
        return n

    @property
    def m(self):
        """The number of edges"""
        if hasattr(self, "e"):
            m = len(self.e)
        else:
            m = 0
        return m

    @property
    def f(self):
        """Types of features and number of features of corresponding type."""
        if hasattr(self, "v"):
            if isinstance(self.v, pd.HDFStore):
                f = "NA"
            else:
                f = self.v.count()
        else:
            f = "there are no nodes"
        return f

    @property
    def r(self):
        """Types of relations and number of relations of corresponding type."""
        if hasattr(self, "e"):
            r = self.e.count()
        else:
            r = "there are no edges"
        return r

    def _plot_2d(
        self, is_map, x, y, edges, C, C_split_0, kwds_scatter, kwds_quiver, kwds_quiver_0, kwds_basemap, ax, m
    ):
        if is_map:
            from mpl_toolkits.basemap import Basemap

        # set kwds
        if kwds_basemap is None:
            kwds_basemap = {}
        else:
            kwds_basemap = kwds_basemap.copy()
        if kwds_scatter is None:
            kwds_scatter = {}
        else:
            kwds_scatter = kwds_scatter.copy()
        if kwds_quiver is None:
            kwds_quiver = {}
        else:
            kwds_quiver = kwds_quiver.copy()
        if kwds_quiver_0 is None:
            kwds_quiver_0 = {}
        else:
            kwds_quiver_0 = kwds_quiver_0.copy()

        # set draw order
        try:
            zorder_qu0 = kwds_quiver_0.pop("zorder")
        except KeyError:
            zorder_qu0 = 1
        try:
            zorder_qu = kwds_quiver.pop("zorder")
        except KeyError:
            zorder_qu = 2
        try:
            zorder_pc = kwds_scatter.pop("zorder")
        except KeyError:
            zorder_pc = 3

        # create dict for matplotlib objects
        obj = {}

        # create figure, axes (and basemap)
        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = ax.get_figure()

        obj["fig"] = fig
        obj["ax"] = ax

        if is_map and m is None:
            m = Basemap(ax=ax, **kwds_basemap)
            obj["m"] = m
        elif is_map and m is not None:
            obj["m"] = m

        # create PathCollection by scatter
        x_str = x
        y_str = y
        x, y = (self.v[x_str].values, self.v[y_str].values)
        if is_map:
            axm = m
            x, y = m(x, y)
            # bug in basemap, it changed dtypes
            x = np.array(x, dtype=float)
            y = np.array(y, dtype=float)
        else:
            axm = ax

        pc = axm.scatter(x, y, zorder=zorder_pc, **kwds_scatter)
        obj["pc"] = pc

        # draw edges as arrows
        if edges is True:
            # source- and target-indices
            s = self.e.index.get_level_values(level=0).values
            t = self.e.index.get_level_values(level=1).values

            # latlon position of sources and targets, vector components
            x, y = (self.v[x_str], self.v[y_str])
            if is_map:
                xs, ys = m(x.loc[s].values, y.loc[s].values)
                xt, yt = m(x.loc[t].values, y.loc[t].values)
            else:
                xs, ys = (x.loc[s].values, y.loc[s].values)
                xt, yt = (x.loc[t].values, y.loc[t].values)

            # upcast dtypes
            xs = np.array(xs, dtype=float)
            ys = np.array(ys, dtype=float)
            xt = np.array(xt, dtype=float)
            yt = np.array(yt, dtype=float)

            dx = xt - xs
            dy = yt - ys

            # bug in basemap, changed dtypes
            if is_map:
                dx = np.array(dx, dtype=float)
                dy = np.array(dy, dtype=float)

            # create quiver plot
            if C_split_0 is not None:
                try:
                    color = kwds_quiver_0.pop("color")
                except KeyError:
                    color = "k"
                try:
                    headwidth = kwds_quiver_0.pop("headwidth")
                except KeyError:
                    headwidth = 1

                C = C_split_0

                qu_0 = axm.quiver(
                    xs[C == 0],
                    ys[C == 0],
                    dx[C == 0],
                    dy[C == 0],
                    color=color,
                    angles="xy",
                    scale_units="xy",
                    scale=1,
                    headwidth=headwidth,
                    zorder=zorder_qu0,
                    **kwds_quiver_0,
                )

                qu = axm.quiver(
                    xs[C != 0],
                    ys[C != 0],
                    dx[C != 0],
                    dy[C != 0],
                    C[C != 0],
                    angles="xy",
                    scale_units="xy",
                    scale=1,
                    zorder=zorder_qu,
                    **kwds_quiver,
                )

                obj["qu_0"] = qu_0
                obj["qu"] = qu

            elif C is not None:
                qu = axm.quiver(
                    xs, ys, dx, dy, C, angles="xy", scale_units="xy", scale=1, zorder=zorder_qu, **kwds_quiver
                )
                obj["qu"] = qu

            else:
                qu = axm.quiver(xs, ys, dx, dy, angles="xy", scale_units="xy", scale=1, zorder=zorder_qu, **kwds_quiver)
                obj["qu"] = qu

        return obj

    def _plot_2d_generator(
        self, is_map, x, y, by, edges, C, C_split_0, kwds_basemap, kwds_scatter, kwds_quiver, kwds_quiver_0, passable_ax
    ):
        if is_map:
            from mpl_toolkits.basemap import Basemap

        # set kwargs
        if kwds_basemap is None:
            kwds_basemap = {}
        else:
            kwds_basemap = kwds_basemap.copy()
        if kwds_scatter is None:
            kwds_scatter = {}
        else:
            kwds_scatter = kwds_scatter.copy()
        if kwds_quiver is None:
            kwds_quiver = {}
        else:
            kwds_quiver = kwds_quiver.copy()
        if kwds_quiver_0 is None:
            kwds_quiver_0 = {}
        else:
            kwds_quiver_0 = kwds_quiver_0.copy()

        # set draw order
        try:
            zorder_qu0 = kwds_quiver_0.pop("zorder")
        except KeyError:
            zorder_qu0 = 1
        try:
            zorder_qu = kwds_quiver.pop("zorder")
        except KeyError:
            zorder_qu = 2
        try:
            zorder_pc = kwds_scatter.pop("zorder")
        except KeyError:
            zorder_pc = 3

        # assert there's no color given in quiver kwds
        if kwds_quiver is not None:
            assert "color" not in kwds_quiver.keys(), "use 'C' or 'C_split_0' for setting the color of quiver!"

        # select v
        v = self.v[_flatten([x, y, by])]

        # store array_like kwargs in dataframe for filtering
        #     and change standard kwargs

        # set xlim/ylim for non map plots
        if not is_map:
            dx = (v[x].max() - v[x].min()) * 0.05
            dy = (v[y].max() - v[y].min()) * 0.05
            xlim = (v[x].min() - dx, v[x].max() + dx)
            ylim = (v[y].min() - dy, v[y].max() + dy)

        # scatter size
        try:
            pc_s = kwds_scatter.pop("s")
            v["pc_s"] = pc_s
        except KeyError:
            v["pc_s"] = 20

        # scatter color
        try:
            pc_c = kwds_scatter.pop("c")
            v["pc_c"] = pc_c
        except KeyError:
            pc_c = None
            v["pc_c"] = 1

        # scatter vmin/vmax -> entire min/max
        try:
            pc_vmin = kwds_scatter.pop("vmin")
        except KeyError:
            if pc_c is not None:
                try:
                    pc_vmin = pc_c.min()
                except AttributeError:
                    pc_vmin = None
            else:
                pc_vmin = None
        try:
            pc_vmax = kwds_scatter.pop("vmax")
        except KeyError:
            if pc_c is not None:
                try:
                    pc_vmax = pc_c.max()
                except AttributeError:
                    pc_vmax = None
            else:
                pc_vmax = None

        # quiver colors, and quiver clim -> entire min/max
        if edges is True:
            if C_split_0 is not None:
                e = pd.DataFrame(data={"Cqu0": C_split_0}, index=self.e.index)
                try:
                    qu_clim = kwds_quiver.pop("clim")
                except KeyError:
                    qu_clim = [C_split_0.min(), C_split_0.max()]

            elif C is not None:
                e = pd.DataFrame(data={"C": C}, index=self.e.index)
                try:
                    qu_clim = kwds_quiver.pop("clim")
                except KeyError:
                    qu_clim = [C.min(), C.max()]

            else:
                e = pd.DataFrame(index=self.e.index)
                qu_clim = None

            # change standard kwargs for quiver_0 at [C_split_0 == 0]
            try:
                color = kwds_quiver_0.pop("color")
            except KeyError:
                color = "k"
            try:
                qu_0_headwidth = kwds_quiver_0.pop("headwidth")
            except KeyError:
                qu_0_headwidth = 1

        else:
            e = None

        # generator loop
        x_str = x
        y_str = y
        gv = v.groupby(by)
        for labels, group in gv:

            def obj(ax=None, m=None):
                """Plot nodes and corresponding edges.

                See ``plot_2d_generator`` or ``plot_map_generator`` for
                details.

                Parameters
                ----------
                ax : matplotlib axes object, optional (default=None)
                    An axes instance to use.

                m : Basemap object, optional (default=None)
                    A mpl_toolkits.basemap.Basemap instance to use.

                Returns
                -------
                obj : dict
                    Return a dict of matplotlib objects.

                """

                # store group labels in obj
                obj = {"group": labels}

                # filter edges by group
                g = DeepGraph(group, e)
                g.update_edges()

                # create figure, axes (and basemap)
                if ax is None:
                    fig, ax = plt.subplots()
                else:
                    fig = ax.get_figure()

                obj["fig"] = fig
                obj["ax"] = ax

                if is_map and m is None:
                    m = Basemap(ax=ax, **kwds_basemap.copy())
                    obj["m"] = m
                elif is_map and m is not None:
                    obj["m"] = m
                else:
                    ax.set_xlim(xlim)
                    ax.set_ylim(ylim)

                # create PathCollection by scatter
                x, y = (g.v[x_str].values, g.v[y_str].values)
                if is_map:
                    axm = m
                    x, y = m(x, y)
                else:
                    axm = ax

                # need to change colors to list, in case they're not numbers
                pc = axm.scatter(
                    x,
                    y,
                    c=g.v.pc_c.values.tolist(),
                    s=g.v.pc_s.values,
                    vmin=pc_vmin,
                    vmax=pc_vmax,
                    zorder=zorder_pc,
                    **kwds_scatter,
                )
                obj["pc"] = pc

                # draw edges as arrows
                if edges is True:
                    # source- and target-indices
                    s = g.e.index.get_level_values(level=0).values
                    t = g.e.index.get_level_values(level=1).values

                    # xy position of sources and targets, vector components
                    x, y = (g.v[x_str], g.v[y_str])
                    if is_map:
                        xs, ys = m(x.loc[s].values, y.loc[s].values)
                        xt, yt = m(x.loc[t].values, y.loc[t].values)
                    else:
                        xs, ys = (x.loc[s].values, y.loc[s].values)
                        xt, yt = (x.loc[t].values, y.loc[t].values)

                    # upcast dtypes
                    xs = np.array(xs, dtype=float)
                    ys = np.array(ys, dtype=float)
                    xt = np.array(xt, dtype=float)
                    yt = np.array(yt, dtype=float)

                    dx = xt - xs
                    dy = yt - ys

                    # bug in basemap, changes dtypes
                    if is_map:
                        dx = np.array(dx, dtype=float)
                        dy = np.array(dy, dtype=float)

                    if C_split_0 is not None:
                        C = g.e.Cqu0.values

                        qu_0 = axm.quiver(
                            xs[C == 0],
                            ys[C == 0],
                            dx[C == 0],
                            dy[C == 0],
                            color=color,
                            angles="xy",
                            scale_units="xy",
                            scale=1,
                            headwidth=qu_0_headwidth,
                            zorder=zorder_qu0,
                            **kwds_quiver_0,
                        )

                        qu = axm.quiver(
                            xs[C != 0],
                            ys[C != 0],
                            dx[C != 0],
                            dy[C != 0],
                            C[C != 0],
                            angles="xy",
                            scale_units="xy",
                            scale=1,
                            clim=qu_clim,
                            zorder=zorder_qu,
                            **kwds_quiver,
                        )

                        obj["qu_0"] = qu_0
                        obj["qu"] = qu

                    elif C is not None:
                        C = g.e.C.values

                        qu = axm.quiver(
                            xs,
                            ys,
                            dx,
                            dy,
                            C,
                            angles="xy",
                            scale_units="xy",
                            scale=1,
                            clim=qu_clim,
                            zorder=zorder_qu,
                            **kwds_quiver,
                        )

                        obj["qu"] = qu

                    else:
                        qu = axm.quiver(
                            xs, ys, dx, dy, angles="xy", scale_units="xy", scale=1, zorder=zorder_qu, **kwds_quiver
                        )

                        obj["qu"] = qu

                return obj

            if passable_ax:
                yield obj
            else:
                yield obj()