# Source code for pyoints.classification

# BEGIN OF LICENSE NOTE
# This file is part of Pyoints.
# Copyright (c) 2018, Sebastian Lamprecht, Trier University,
# lamprecht@uni-trier.de
#
# Pyoints is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Pyoints is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Pyoints. If not, see <https://www.gnu.org/licenses/>.
# END OF LICENSE NOTE
"""Collection of functions to classify or reclassify values or cluster values.
"""

import numpy as np
from collections import defaultdict

from . import (
    assertion,
    nptools,
)

from .misc import print_rounded


def classes_to_dict(
        classification,
        ids=None,
        min_size=1,
        max_size=np.inf,
        missing_value=-1):
    """Groups the positions of equal class labels into a dictionary.

    Parameters
    ----------
    classification : array_like(shape=(n))
        Class label of each element.
    ids : optional, array_like(int, shape=(n))
        Identifier to store for each element of `classification`. If None,
        the elements are numbered consecutively starting at zero.
    min_size,max_size : optional, positive int
        Classes with fewer than `min_size` or more than `max_size` members
        are removed from the result.
    missing_value : optional, object
        Label marking unclassified elements. Elements equal to this value
        are not assigned to any class.

    Returns
    -------
    dict
        Maps each class label to the list of identifiers of its members,
        in order of appearance.

    Raises
    ------
    ValueError
        If `classification` is not array like or if `ids` and
        `classification` differ in length.

    See Also
    --------
    dict_to_classes

    Examples
    --------

    >>> classes = ['cat', 'cat', 'dog', 'bird', 'dog', 'bird', 'cat', 'dog']
    >>> class_dict = classes_to_dict(classes)
    >>> print(sorted(class_dict))
    ['bird', 'cat', 'dog']
    >>> print_rounded(class_dict['cat'])
    [0 1 6]

    >>> classes = [0, 0, 1, 2, 1, 0, 3, 3, 5, 3, 2, 1, 0]
    >>> print(classes_to_dict(classes))
    {0: [0, 1, 5, 12], 1: [2, 4, 11], 2: [3, 10], 3: [6, 7, 9], 5: [8]}

    """
    if not nptools.isarray(classification):
        raise ValueError("'classification' needs to be an array like object")

    if ids is None:
        ids = range(len(classification))
    elif len(ids) != len(classification):
        raise ValueError(
            "'classification' and 'ids' must have the same length")

    # collect the identifiers of each class, skipping unclassified elements
    grouped = defaultdict(list)
    for index, class_id in zip(ids, classification):
        if class_id == missing_value:
            continue
        grouped[class_id].append(index)

    # drop classes of undesired size (only if a size limit was requested)
    if min_size > 1 or max_size < np.inf:
        grouped = {
            class_id: members
            for class_id, members in grouped.items()
            if not (len(members) < min_size or len(members) > max_size)
        }

    return dict(grouped)
def dict_to_classes(
        classes_dict,
        n,
        min_size=1,
        max_size=np.inf,
        missing_value=-1):
    """Converts a dictionary of classes into a list of classes.

    Parameters
    ----------
    classes_dict : dict
        Dictionary of class indices. The keys represent the class ids, while
        the values hold the array indices belonging to the class.
    n : positive int
        Desired size of the output array. It must exceed the largest index
        stored in `classes_dict`.
    min_size,max_size : optional, positive int
        Minimum and maximum desired size of a class to be kept in the result.
    missing_value : optional, object
        Default value for unclassified values.

    Returns
    -------
    np.ndarray(object, shape=(n))
        Array representation of `classes_dict`. Position `i` holds the key
        of the class containing index `i` or `missing_value` if no kept
        class contains it.

    Raises
    ------
    TypeError
        If `classes_dict` is not a dictionary.
    ValueError
        If `n` is not an integer greater than zero.

    See Also
    --------
    classes_to_dict

    Notes
    -----
    Only a minimal input validation is provided. If an index is listed in
    more than one class, the class visited last wins.

    Examples
    --------

    Alphanumeric classes.

    >>> classes_dict = {'bird': [0, 1, 5, 4], 'dog': [3, 6, 8], 'cat': [7]}
    >>> print(dict_to_classes(classes_dict, 10, missing_value=''))
    ['bird' 'bird' '' 'dog' 'bird' 'bird' 'dog' 'cat' 'dog' '']

    Omit small classes.

    >>> print(dict_to_classes(classes_dict, 10, min_size=2))
    ['bird' 'bird' -1 'dog' 'bird' 'bird' 'dog' -1 'dog' -1]

    Numeric classes.

    >>> classes_dict = {0: [0, 1, 5], 1: [3, 6], 2: [7, 2]}
    >>> print(dict_to_classes(classes_dict, 9))
    [0 0 2 1 -1 0 1 2 -1]

    """
    # type validation
    if not isinstance(classes_dict, dict):
        raise TypeError("dictionary required")
    # The parentheses matter: `not isinstance(n, int) and n > 0` would only
    # reject positive non-integers and let zero or negative sizes pass.
    if not (isinstance(n, int) and n > 0):
        raise ValueError("'n' needs to be an integer greater zero")

    # prepare output
    # Class ids may be of any type and are mixed with `missing_value`, so an
    # object array is required. Under Python 3 the former
    # `np.array(classes_dict.values()).dtype` always evaluated to `object`
    # as well, since a dictionary view is wrapped as a 0-d object array.
    classification = np.full(n, missing_value, dtype=object)

    # assign classes
    for cId, ids in classes_dict.items():
        if min_size <= len(ids) <= max_size:
            classification[ids] = cId

    return classification
def split_by_breaks(values, breaks):
    """Assigns each value to the range it falls into.

    Parameters
    ----------
    values : array_like(Number, shape=(n))
        Values to classify.
    breaks : array_like(Number, shape=(m))
        Boundaries separating neighbouring value ranges.

    Returns
    -------
    classification : np.ndarray(int, shape=(n))
        Class affiliation of `values` as returned by `np.digitize`. For
        increasing `breaks`, a value of `classification[i]` equal to `k`
        means that `values[i]` is in range `[breaks[k-1], breaks[k][`.
        Class `0` collects the values below `breaks[0]` and class `m` the
        values of at least `breaks[m-1]`.

    Examples
    --------

    >>> values = np.arange(10)
    >>> breaks = [0.5, 5, 7.5]
    >>> classes = split_by_breaks(values, breaks)
    >>> print_rounded(classes)
    [0 1 1 1 1 2 2 2 3 3]

    """
    # validate both inputs before binning (values first, then breaks)
    numeric_values = assertion.ensure_numvector(values)
    numeric_breaks = assertion.ensure_numvector(breaks)

    classification = np.digitize(numeric_values, numeric_breaks)
    return classification
def rename_dict(d, ids=None):
    """Builds a dictionary holding the values of `d` under new keys.

    Parameters
    ----------
    d : dict
        Dictionary whose values are to be kept.
    ids : optional, array_like(shape=(len(d)))
        New key names, assigned to the values of `d` in iteration order. If
        None, the keys are numbered consecutively starting at zero.

    Returns
    -------
    dict
        New dictionary with the desired keys. The input is not modified.

    Raises
    ------
    TypeError
        If `d` is not a dictionary.
    ValueError
        If the number of `ids` differs from the number of entries in `d`.

    Examples
    --------

    >>> d = {1: [0, 1], 2: None, 3: 'text'}
    >>> renamed_dict = rename_dict(d, ['first', 'second', 'last'])
    >>> print(sorted(renamed_dict))
    ['first', 'last', 'second']

    """
    if not isinstance(d, dict):
        raise TypeError("dictionary required")

    # no names given: number the values consecutively
    if ids is None:
        return dict(enumerate(d.values()))

    if len(ids) != len(d):
        raise ValueError("same number of keys required")

    return {new_key: value for new_key, value in zip(ids, d.values())}
def majority(classes, empty_value=-1):
    """Finds most frequent class or value in an array.

    Parameters
    ----------
    classes : array_like(object, shape=(n))
        Classes or values to check. The values need to be hashable.
    empty_value : optional, object
        Class value in case that no decision can be made.

    Returns
    -------
    object
        Most frequent class. If several classes share the highest frequency
        or if `classes` is empty, `empty_value` is returned.

    Raises
    ------
    ValueError
        If `classes` is not an array like object.

    Notes
    -----
    Only a limited input validation is provided.

    Examples
    --------

    Find majority class.

    >>> classes =['cat', 'dog', 'dog', 'bird', 'cat', 'dog']
    >>> print(majority(classes))
    dog

    >>> classes =[1, 8, 9, 0, 0, 2, 4, 2, 4, 3, 2, 3, 5, 6]
    >>> print_rounded(majority(classes))
    2

    No decision possible.

    >>> classes =[1, 2, 3, 4, 4, 3]
    >>> print_rounded(majority(classes))
    -1

    """
    if not nptools.isarray(classes):
        raise ValueError("'classes' needs to be an array like object")

    # Nothing to vote on. Without this guard the loop variable would never
    # be bound and the function would fail with an UnboundLocalError.
    # `len` is used instead of truthiness, since numpy arrays do not
    # provide an unambiguous truth value.
    if len(classes) == 0:
        return empty_value

    half = len(classes) // 2
    count = defaultdict(int)
    for cId in classes:
        count[cId] += 1
        # a class covering more than half of the values can not be beaten,
        # so the remaining values do not need to be counted
        if count[cId] > half:
            return cId

    # no absolute majority: accept the most frequent class if it is unique
    best = max(count.values())
    winners = [key for key, frequency in count.items() if frequency == best]
    if len(winners) > 1:
        return empty_value
    return winners[0]