# Source code for mgcpy.hypothesis_tests.transforms

import numpy as np
from mgcpy.independence_tests.dcorr import DCorr
from sklearn import preprocessing

def k_sample_transform(x, y, is_y_categorical=False):
    '''
    Transform to represent a k-sample test as an independence test

    :param x: is interpreted as either:

        - a [n*n] distance matrix, a square matrix with zeros on diagonal for n samples OR
        - a [n*p] data matrix, a matrix with n samples in p dimensions OR
        - a [n] vector, which is treated as a [n*1] data matrix
    :type x: 1D or 2D numpy.array

    :param y: is interpreted as either:

        - a [m*m] distance matrix, a square matrix with zeros on diagonal for m samples OR
        - a [m*p] data matrix, a matrix with m samples in p dimensions OR
        - a [m] vector, which is treated as a [m*1] matrix OR
        - a [n*1] label matrix, categorical data for x, if is_y_categorical is set to True
    :type y: 1D or 2D numpy.array

    :param is_y_categorical: if set to True, y has categorical data and is a labels array for x,
                             else, it is a plain data matrix
    :type is_y_categorical: boolean

    :return:

        - :u: the data matrix; x stacked on top of y (dimensions [n+m, p]) when y is a data matrix,
              or x itself when y holds the labels
        - :v: a single-column label matrix for u, which indicates to which category each row of u
              belongs: 1 for rows of x and 2 for rows of y when y is a data matrix, otherwise the
              ``LabelEncoder`` codes of y shifted by +1
    :rtype: tuple

    :raises AssertionError: if the shapes of x and y are not compatible with the selected mode
    '''
    # Promote 1D inputs to single-column matrices *before* validating, since the
    # checks below index shape[1] and would otherwise fail with an IndexError.
    if len(x.shape) == 1:
        x = x[..., np.newaxis]
    if len(y.shape) == 1:
        y = y[..., np.newaxis]

    if not is_y_categorical:
        assert x.shape[1] == y.shape[1], "Matrices X and Y need to be of same dimension p"
        u = np.concatenate([x, y], axis=0)
        # label every row of x with 1 and every row of y with 2
        v = np.concatenate([np.repeat(1, x.shape[0]), np.repeat(2, y.shape[0])], axis=0)
    else:
        assert x.shape[0] == y.shape[0] and y.shape[1] == 1, "Matrices X and Y need to be of dimensions [n, p], [n, 1]"
        u = x
        # the encoder's integer codes are shifted by one before being used as labels
        v = preprocessing.LabelEncoder().fit_transform(y.flatten()) + 1

    # the labels are built as a flat vector; return them as a column matrix
    if len(v.shape) == 1:
        v = v[..., np.newaxis]

    return u, v

def paired_two_sample_transform(x, y):
    '''
    Transform to represent a paired two-sample test as an independence test

    Steps:

    - combine x and y to get the joint_distribution
    - sample n pairs (with replacement) from the joint_distribution
    - compute the euclidean distance between the sampled n pairs, which is randomly_sampled_pairs_distance
    - compute the euclidean distance between the actual x and y, which is actual_pairs_distance
    - compute the two sample transformed matrices of randomly_sampled_pairs_distance and actual_pairs_distance

    :param x: a [n*p] data matrix, a matrix with n samples in p dimensions
    :type x: 2D numpy.array

    :param y: a [n*p] data matrix, a matrix with n samples in p dimensions, row i being paired with row i of x
    :type y: 2D numpy.array

    :return: the output of ``k_sample_transform`` applied to the two distance columns:

        - :u: a [2*n, 1] matrix holding the n randomly sampled pair distances followed by
              the n actual pair distances
        - :v: a label matrix for u, which indicates to which category each data entry in u belongs to
    :rtype: tuple

    :raises AssertionError: if x and y do not have the same shape
    '''
    assert x.shape == y.shape, "Matrices X and Y need to be of same dimensions [n, p]"

    n_pairs = x.shape[0]
    joint_distribution = np.concatenate([x, y], axis=0)  # (2n, p) shape
    n_pooled = joint_distribution.shape[0]

    # Draw two row indices per pair; kept as one randint call per pair so the
    # sequence of calls made on numpy's global RNG is unchanged.
    pairwise_sampled_xy = np.array([joint_distribution[np.random.randint(n_pooled, size=2), :]
                                    for _ in range(n_pairs)])  # (n, 2, p) shape
    pairwise_sampled_x = pairwise_sampled_xy[:, 0]  # (n, p) shape
    pairwise_sampled_y = pairwise_sampled_xy[:, 1]  # (n, p) shape

    # Compute the euclidean distances. norm(..., axis=1) yields flat (n,) vectors,
    # but k_sample_transform validates shape[1], so hand it (n, 1) column matrices.
    randomly_sampled_pairs_distance = np.linalg.norm(pairwise_sampled_x - pairwise_sampled_y, axis=1)[:, np.newaxis]
    actual_pairs_distance = np.linalg.norm(x - y, axis=1)[:, np.newaxis]

    u, v = k_sample_transform(randomly_sampled_pairs_distance, actual_pairs_distance)

    return u, v

def paired_two_sample_test_dcorr(x, y, which_test="biased", compute_distance_matrix=None, is_fast=False):
    '''
    Run the paired two sample test through a ``DCorr`` instance created with ``is_paired=True``

    :param x: is interpreted as either:

        - a [n*n] distance matrix, a square matrix with zeros on diagonal for n samples OR
        - a [n*p] data matrix, a matrix with n samples in p dimensions
    :type x: 2D numpy.array

    :param y: is interpreted as either:

        - a [n*n] distance matrix, a square matrix with zeros on diagonal for n samples OR
        - a [n*p] data matrix, a matrix with n samples in p dimensions
    :type y: 2D numpy.array

    :param which_test: forwarded unchanged to the ``DCorr`` constructor, defaults to "biased"
    :param compute_distance_matrix: forwarded unchanged to the ``DCorr`` constructor, defaults to None
    :param is_fast: forwarded unchanged to ``DCorr.p_value``, defaults to False

    :return: whatever ``DCorr.p_value(x, y, is_fast=is_fast)`` returns
    :raises AssertionError: if x and y do not have the same shape

    NOTE(review): this was previously documented as returning the DCorr test_statistic (a float),
    but the code calls ``p_value`` -- confirm which of the two is intended.
    '''
    assert x.shape == y.shape, "Matrices X and Y need to be of same dimensions [n, p]"

    # build the paired-mode test object, then delegate the whole computation to it
    paired_test = DCorr(
        is_paired=True,
        which_test=which_test,
        compute_distance_matrix=compute_distance_matrix,
    )
    result = paired_test.p_value(x, y, is_fast=is_fast)
    return result