Source code for dataset.correlations

import math
import scipy.stats as ss
import numpy as np
import pandas as pd
from collections import Counter


def convert(data, to):
    """
    Converts data into the container type named by `to`.

    Supported conversions:
      * 'array': from NumPy ndarray, Pandas Series, list or Pandas DataFrame
      * 'list': from list, Pandas Series or NumPy ndarray
      * 'dataframe': from Pandas DataFrame or NumPy ndarray

    Data that is already of the requested type is returned as-is (no copy).

    :param data: list / NumPy ndarray / Pandas Series / Pandas DataFrame
        The data to convert
    :param to: str
        The target type: 'array', 'list' or 'dataframe'
    :return: the converted data
    :raises ValueError: if `to` is not one of the supported target names
    :raises TypeError: if `data` is of a type that cannot be converted to `to`
    """
    converted = None
    if to == 'array':
        if isinstance(data, np.ndarray):
            converted = data
        elif isinstance(data, (pd.Series, pd.DataFrame)):
            # `.values` replaces DataFrame.as_matrix(), which was removed in
            # pandas 1.0; it is available on both old and new pandas versions.
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
    elif to == 'list':
        if isinstance(data, list):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == 'dataframe':
        if isinstance(data, pd.DataFrame):
            converted = data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
    if converted is None:
        # `to` was valid, but no branch above knew how to handle type(data)
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data), to))
    return converted
def conditional_entropy(x, y):
    """
    Calculates the conditional entropy of x given y: S(x|y), using the
    natural logarithm.

    Wikipedia: <https://en.wikipedia.org/wiki/Conditional_entropy>

    :param x: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :return: float
    """
    # Occurrences of every y value, and of every (x, y) pair
    marginal_counts = Counter(y)
    joint_counts = Counter(zip(x, y))
    n_samples = sum(marginal_counts.values())
    result = 0.0
    for (_, y_value), joint_count in joint_counts.items():
        p_joint = joint_count / n_samples
        p_marginal = marginal_counts[y_value] / n_samples
        # S(x|y) = sum over pairs of p(x,y) * log(p(y) / p(x,y))
        result += p_joint * math.log(p_marginal / p_joint)
    return result
def cramers_v(x, y):
    """
    Calculates Cramer's V statistic for categorical-categorical association.
    Uses the bias correction from Wicher Bergsma, Journal of the Korean
    Statistical Society 42 (2013): 323-328.
    This is a symmetric coefficient: V(x,y) = V(y,x)

    Degenerate inputs, for which the bias-corrected statistic is undefined
    (one of the variables has a single category, or the corrected number of
    rows/columns collapses to 1), yield 0.0 instead of NaN/inf.

    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: <https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V>

    :param x: list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    :return: float
        in the range of [0,1]
    """
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    r, k = confusion_matrix.shape
    if r < 2 or k < 2:
        # A variable with a single category carries no association; this
        # also avoids dividing by n-1 == 0 when there is only one sample.
        return 0.0
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    # Bias-corrected phi^2 and bias-corrected table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # Happens when every sample is its own category (r == n or k == n);
        # the ratio would be 0/0, so report no measurable association.
        return 0.0
    return np.sqrt(phi2corr / denominator)
def theils_u(x, y):
    """
    Calculates Theil's U statistic (Uncertainty coefficient) for
    categorical-categorical association. This is the uncertainty of x given
    y: the value is in the range [0,1], where 0 means y provides no
    information about x, and 1 means y provides full information about x.
    This is an asymmetric coefficient: U(x,y) != U(y,x)

    Wikipedia: <https://en.wikipedia.org/wiki/Uncertainty_coefficient>

    :param x: list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    :return: float
        in the range of [0,1]
    """
    entropy_x_given_y = conditional_entropy(x, y)
    # Marginal distribution of x and its entropy
    value_counts = Counter(x)
    n_samples = sum(value_counts.values())
    probabilities = [count / n_samples for count in value_counts.values()]
    entropy_x = ss.entropy(probabilities)
    if entropy_x == 0:
        # x is constant: there is no uncertainty left for y to explain
        return 1
    return (entropy_x - entropy_x_given_y) / entropy_x
def correlation_ratio(categories, measurements):
    """
    Calculates the Correlation Ratio (sometimes marked by the greek letter
    Eta) for categorical-continuous association.
    Answers the question - given a continuous value of a measurement, is it
    possible to know which category it is associated with?
    Value is in the range [0,1], where 0 means a category cannot be
    determined by a continuous measurement, and 1 means a category can be
    determined with absolute certainty.

    Eta is the square root of the ratio between the weighted variance of the
    category means and the variance of all measurements.

    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio

    :param categories: list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    :param measurements: list / NumPy ndarray / Pandas Series
        A sequence of continuous measurements
    :return: float
        in the range of [0,1]
    """
    categories = convert(categories, 'array')
    measurements = convert(measurements, 'array')
    # Encode every category as an integer code 0..cat_num-1
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(cat_num):
        cat_measures = measurements[fcat == i]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(y_avg_array * n_array) / np.sum(n_array)
    # Between-category dispersion (weighted by category size) ...
    numerator = np.sum(n_array * (y_avg_array - y_total_avg) ** 2)
    # ... relative to the total dispersion of the measurements
    denominator = np.sum((measurements - y_total_avg) ** 2)
    if numerator == 0:
        # All category means coincide (also covers constant measurements,
        # where the denominator is 0 as well)
        return 0.0
    # numerator / denominator is eta squared; eta itself is its square root
    return np.sqrt(numerator / denominator)