# Source code for pyoints.classification

# BEGIN OF LICENSE NOTE
# This file is part of Pyoints.
# Copyright (c) 2018, Sebastian Lamprecht, Trier University,
# lamprecht@uni-trier.de
#
# Pyoints is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Pyoints is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Pyoints. If not, see <https://www.gnu.org/licenses/>.
# END OF LICENSE NOTE
"""Collection of functions to classify or reclassify values or cluster values.
"""

from collections import Counter
from collections import defaultdict

import numpy as np

from . import (
    assertion,
    nptools,
)

from .misc import print_rounded


def classes_to_dict(
        classification,
        ids=None,
        min_size=1,
        max_size=np.inf,
        missing_value=-1):
    """Groups the entries of a class array by their class value.

    Parameters
    ----------
    classification : array_like(shape=(n))
        Array of class indices.
    ids : optional, array_like(int, shape=(n))
        Identifiers collected in the result, one for each entry of
        `classification`. If None, the entries are numbered consecutively.
    min_size,max_size : optional, positive int
        Minimum and maximum number of members a class needs to have to be
        kept in the result.
    missing_value : optional, object
        Class value marking unclassified entries. Entries equal to this value
        are skipped.

    Returns
    -------
    dict
        Dictionary of class indices. The dictionary keys represent the class
        ids, while the values are lists of the identifiers belonging to the
        class.

    Raises
    ------
    ValueError
        If `classification` is not array like or if `ids` and
        `classification` differ in length.

    See Also
    --------
    dict_to_classes
        Converts a dictionary of classes into an array of classes.

    Examples
    --------

    >>> classes = ['cat', 'cat', 'dog', 'bird', 'dog', 'bird', 'cat', 'dog']
    >>> class_dict = classes_to_dict(classes)
    >>> print(sorted(class_dict))
    ['bird', 'cat', 'dog']
    >>> print_rounded(class_dict['cat'])
    [0 1 6]

    >>> classes = [0, 0, 1, 2, 1, 0, 3, 3, 5, 3, 2, 1, 0]
    >>> print(classes_to_dict(classes))
    {0: [0, 1, 5, 12], 1: [2, 4, 11], 2: [3, 10], 3: [6, 7, 9], 5: [8]}

    """
    if not nptools.isarray(classification):
        raise ValueError("'classification' needs to be an array like object")

    if ids is None:
        ids = range(len(classification))
    elif len(ids) != len(classification):
        raise ValueError(
            "'classification' and 'ids' must have the same length")

    # collect the identifiers of each class, skipping unclassified entries
    groups = defaultdict(list)
    for member, label in zip(ids, classification):
        if label == missing_value:
            continue
        groups[label].append(member)

    # the size filter is only required if a limit has been tightened
    if min_size > 1 or max_size < np.inf:
        return {
            label: members
            for label, members in groups.items()
            if not (len(members) < min_size or len(members) > max_size)
        }

    return dict(groups)


def dict_to_classes(
        classes_dict,
        n,
        min_size=1,
        max_size=np.inf,
        missing_value=-1):
    """Converts a dictionary of classes into a list of classes.

    Parameters
    ----------
    classes_dict : dict
        Dictionary of class indices. The keys represent the class ids, while
        the values are the array indices belonging to the class.
    n : positive int
        Desired size of the output array. It must be greater than the largest
        index stored in `classes_dict`.
    min_size,max_size : optional, positive int
        Minimum and maximum desired size of a class to be kept in the result.
    missing_value : optional, object
        Default value for unclassified values.

    Returns
    -------
    np.ndarray(object, shape=(n))
        Array representation of `classes_dict`. Each position holds the id of
        the class it belongs to, or `missing_value` if it is not a member of
        any class kept.

    Raises
    ------
    TypeError
        If `classes_dict` is not a dictionary.
    ValueError
        If `n` is not an integer greater zero.

    See Also
    --------
    classes_to_dict

    Notes
    -----
    Only a minimal input validation is provided.

    Examples
    --------

    Alphanumeric classes.

    >>> classes_dict = {'bird': [0, 1, 5, 4], 'dog': [3, 6, 8], 'cat': [7]}
    >>> print(dict_to_classes(classes_dict, 10, missing_value=''))
    ['bird' 'bird' '' 'dog' 'bird' 'bird' 'dog' 'cat' 'dog' '']

    Omit small classes.

    >>> print(dict_to_classes(classes_dict, 10, min_size=2))
    ['bird' 'bird' -1 'dog' 'bird' 'bird' 'dog' -1 'dog' -1]

    Numeric classes.

    >>> classes_dict = {0: [0, 1, 5], 1: [3, 6], 2: [7, 2]}
    >>> print(dict_to_classes(classes_dict, 9))
    [0 0 2 1 -1 0 1 2 -1]

    """
    # type validation
    if not isinstance(classes_dict, dict):
        raise TypeError("dictionary required")
    # Both conditions have to hold. The type check comes first, so that values
    # which can not be compared to zero are rejected with a ValueError too.
    if not (isinstance(n, (int, np.integer)) and n > 0):
        raise ValueError("'n' needs to be an integer greater zero")

    # prepare output
    # An object array is required, since class ids and `missing_value` may be
    # of different types (e.g. string ids combined with the default of -1).
    classification = np.full(n, missing_value, dtype=object)

    # assign classes
    for cId, ids in classes_dict.items():
        if min_size <= len(ids) <= max_size:
            classification[ids] = cId

    return classification


def split_by_breaks(values, breaks):
    """Assigns each value to the range it falls into.

    Parameters
    ----------
    values : array_like(Number, shape=(n))
        Values to classify.
    breaks : array_like(Number, shape=(m))
        Limits separating the value ranges.

    Returns
    -------
    classification : np.ndarray(int, shape=(n))
        Desired class affiliation of `values`, as returned by `np.digitize`.
        For increasing `breaks`, a value of `classification[i]` equal to `k`
        means that `values[i]` is in range `[breaks[k-1], breaks[k][`. Values
        below `breaks[0]` receive class `0`, values of at least `breaks[-1]`
        receive class `m`.

    Examples
    --------

    >>> values = np.arange(10)
    >>> breaks = [0.5, 5, 7.5]
    >>> classes = split_by_breaks(values, breaks)
    >>> print_rounded(classes)
    [0 1 1 1 1 2 2 2 3 3]

    """
    # validate both inputs (values first) before looking up the ranges
    checked_values = assertion.ensure_numvector(values)
    checked_breaks = assertion.ensure_numvector(breaks)
    classification = np.digitize(checked_values, checked_breaks)
    return classification


def rename_dict(d, ids=None):
    """Replaces the keys of a dictionary, keeping the order of its values.

    Parameters
    ----------
    d : dict
        Dictionary to rename.
    ids : optional, array_like(shape=(len(d)))
        Desired key names, assigned to the values of `d` in iteration order.
        If None, the keys are numbered consecutively.

    Returns
    -------
    dict
        New dictionary holding the values of `d` with the new key names.

    Raises
    ------
    TypeError
        If `d` is not a dictionary.
    ValueError
        If the number of `ids` differs from the number of entries of `d`.

    Examples
    --------

    >>> d = {1: [0, 1], 2: None, 3: 'text'}
    >>> renamed_dict = rename_dict(d, ['first', 'second', 'last'])
    >>> print(sorted(renamed_dict))
    ['first', 'last', 'second']

    """
    if not isinstance(d, dict):
        raise TypeError("dictionary required")

    if ids is None:
        new_keys = range(len(d))
    else:
        if len(ids) != len(d):
            raise ValueError("same number of keys required")
        new_keys = ids

    # pair each new key with the value at the same position
    return {new_key: value for new_key, value in zip(new_keys, d.values())}


def majority(classes, empty_value=-1):
    """Finds most frequent class or value in an array.

    Parameters
    ----------
    classes : array_like(object, shape=(n))
        Classes or values to check. The values need to be hashable.
    empty_value : optional, object
        Class value in case that no decision can be made.

    Returns
    -------
    object
        Most frequent class. If `classes` is empty or if several classes
        share the highest frequency, `empty_value` is returned.

    Raises
    ------
    ValueError
        If `classes` is not an array like object.

    Notes
    -----
    Only a limited input validation is provided.

    Examples
    --------

    Find majority class.

    >>> classes =['cat', 'dog', 'dog', 'bird', 'cat', 'dog']
    >>> print(majority(classes))
    dog

    >>> classes =[1, 8, 9, 0, 0, 2, 4, 2, 4, 3, 2, 3, 5, 6]
    >>> print_rounded(majority(classes))
    2

    No decision possible.

    >>> classes =[1, 2, 3, 4, 4, 3]
    >>> print_rounded(majority(classes))
    -1

    """
    if not nptools.isarray(classes):
        raise ValueError("'classes' needs to be an array like object")

    # `iter` ensures that the elements are counted one by one, since
    # `Counter` would interpret a mapping as a table of counts.
    ranking = Counter(iter(classes)).most_common(2)

    # nothing to decide on
    if not ranking:
        return empty_value

    # the two most frequent classes are tied
    if len(ranking) == 2 and ranking[0][1] == ranking[1][1]:
        return empty_value

    return ranking[0][0]