"""Main module."""
#flatiron_stats
[docs]def welch_t(a, b):
import numpy as np
import scipy.stats as stats
import scipy
""" Calculate Welch's t statistic for two samples. """
numerator = a.mean() - b.mean()
# “ddof = Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
# where N represents the number of elements. By default ddof is zero.
denominator = np.sqrt(a.var(ddof=1)/a.size + b.var(ddof=1)/b.size)
return np.abs(numerator/denominator)
[docs]def welch_df(a, b):
import numpy as np
import scipy.stats as stats
import scipy
# Calculate the effective degrees of freedom for two samples. This function returns the degrees of freedom
s1 = a.var(ddof=1)
s2 = b.var(ddof=1)
n1 = a.size
n2 = b.size
numerator = (s1/n1 + s2/n2)**2
denominator = (s1/ n1)**2/(n1 - 1) + (s2/ n2)**2/(n2 - 1)
return numerator/denominator
[docs]def p_value_welch_ttest(a, b, two_sided=False):
"""Calculates the p-value for Welch's t-test given two samples.
By default, the returned p-value is for a one-sided t-test.
Set the two-sided parameter to True if you wish to perform a two-sided t-test instead.
"""
import numpy as np
import scipy.stats as stats
import scipy
t = welch_t(a, b)
df = welch_df(a, b)
p = 1-stats.t.cdf(np.abs(t), df)
if two_sided:
return 2*p
else:
return p
[docs]def evaluate_PDF(rv, x=4):
'''Input: a random variable object, standard deviation
output : x and y values for the normal distribution
'''
import numpy as np
import scipy.stats as stats
import scipy
# Identify the mean and standard deviation of random variable
mean = rv.mean()
std = rv.std()
# Use numpy to calculate evenly spaced numbers over the specified interval (4 sd) and generate 100 samples.
xs = np.linspace(mean - x*std, mean + x*std, 100)
# Calculate the peak of normal distribution i.e. probability density.
ys = rv.pdf(xs)
return xs, ys # Return calculated values
[docs]def overlap_superiority(group1, group2, n=1000):
"""Estimates overlap and superiority based on a sample.
group1: scipy.stats rv object
group2: scipy.stats rv object
n: sample size
"""
import numpy as np
import scipy.stats as stats
import scipy
# Get a sample of size n from both groups
group1_sample = group1.rvs(n)
group2_sample = group2.rvs(n)
# Identify the threshold between samples
thresh = (group1.mean() + group2.mean()) / 2
print(thresh)
# Calculate no. of values above and below for group 1 and group 2 respectively
above = sum(group1_sample < thresh)
below = sum(group2_sample > thresh)
# Calculate the overlap
overlap = (above + below) / n
# Calculate probability of superiority
superiority = sum(x > y for x, y in zip(group1_sample, group2_sample)) / n
return overlap, superiority
[docs]def Cohen_d(group1, group2, correction = False):
"""Compute Cohen's d
d = (group1.mean()-group2.mean())/pool_variance.
pooled_variance= (n1 * var1 + n2 * var2) / (n1 + n2)
Args:
group1 (Series or NumPy array): group 1 for calculating d
group2 (Series or NumPy array): group 2 for calculating d
correction (bool): Apply equation correction if N<50. Default is False.
- Url with small ncorrection equation:
- https://www.statisticshowto.datasciencecentral.com/cohens-d/
Returns:
d (float): calculated d value
INTERPRETATION OF COHEN's D:
> Small effect = 0.2
> Medium Effect = 0.5
> Large Effect = 0.8
"""
import scipy.stats as stats
import scipy
import numpy as np
N = len(group1)+len(group2)
diff = group1.mean() - group2.mean()
n1, n2 = len(group1), len(group2)
var1 = group1.var()
var2 = group2.var()
# Calculate the pooled threshold as shown earlier
pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
# Calculate Cohen's d statistic
d = diff / np.sqrt(pooled_var)
## Apply correction if needed
if (N < 50) & (correction==True):
d=d * ((N-3)/(N-2.25))*np.sqrt((N-2)/N)
return d
[docs]def plot_pdfs(cohen_d=2):
"""Plot PDFs for distributions that differ by some number of stds.
cohen_d: number of standard deviations between the means
"""
import numpy as np
import scipy.stats as stats
import scipy
import scipy
import matplotlib.pyplot as plt
group1 = scipy.stats.norm(0, 1)
group2 = scipy.stats.norm(cohen_d, 1)
xs, ys = evaluate_PDF(group1)
plt.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)
xs, ys = evaluate_PDF(group2)
plt.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)
o, s = overlap_superiority(group1, group2)
print('overlap', o)
print('superiority', s)
# def find_outliers(df,col=None,report=True):
# """
# ` Uses Tukey's Interquartile Range Method to find outliers.
# - threshold = 1.5 * IQR
# - Lower threshold = [25% quartile] - threshold
# - Upper threshold = [75% quartile] + threshold
# - Outliers are below the lower or above the upper threshold.
# Returns a series of T/F for each row for slicing outliers: df[idx_out]
# EXAMPLE USE:
# >> idx_outs = find_outliers_df(df,col='AdjustedCompensation')
# >> good_data = data[~idx_outs].copy()
# """
# import numpy as np
# import scipy.stats as stats
# import pandas as pd
# if isinstance(df,pd.DataFrame) & (col is None):
# raise Exception('Must provide a column name if passing a DataFrame.')
# elif col is not None:
# data = df[col].copy()
# else:
# data = df.copy()
# res = data.describe()
# iqr_thresh = 1.5 * (res['75%'] - res['25%'])
# upper = res['75%']+iqr_thresh
# lower = res['25%']-iqr_thresh
# idx_outs = ((data>upper)|(data<lower))
# def get_true_count(idx_outs):
# if True in idx_outs.value_counts().index:
# return idx_outs.value_counts()[True]
# else:
# return 0
# if report:
# print(f'[i] outliers for {col}: {get_true_count(idx_outs)} / {len(data)} total.')
# return idx_outs
[docs]def find_outliers_IQR(data,col=None):
"""
Use Tukey's Method of outlier removal AKA InterQuartile-Range Rule
and return boolean series where True indicates it is an outlier.
- Calculates the range between the 75% and 25% quartiles
- Outliers fall outside upper and lower limits, using a treshold of 1.5*IQR the 75% and 25% quartiles.
IQR Range Calculation:
res = df.describe()
IQR = res['75%'] - res['25%']
lower_limit = res['25%'] - 1.5*IQR
upper_limit = res['75%'] + 1.5*IQR
Args:
data (DataFrame,Series,or ndarray): data to test for outliers.
col (str): If passing a DataFrame, must specify column to use.
Returns:
[boolean Series]: A True/False for each row use to slice outliers.
EXAMPLE USE:
>> idx_outs = find_outliers_df(df,col='AdjustedCompensation')
>> good_data = data[~idx_outs].copy()
"""
import pandas as pd
from scipy import stats
import numpy as np
if isinstance(data, pd.DataFrame):
if col is None:
raise Exception('If passing a DataFrame, must provide col=')
else:
data = data[col]
elif isinstance(data,np.ndarray):
data= pd.Series(data)
elif isinstance(data,pd.Series):
pass
else:
raise Exception('data must be a DataFrame, Series, or np.ndarray')
res = data.describe()
IQR = res['75%'] - res['25%']
lower_limit = res['25%'] - 1.5*IQR
upper_limit = res['75%'] + 1.5*IQR
idx_outs = (data>upper_limit) | (data<lower_limit)
return idx_outs
[docs]def find_outliers_Z(data,col=None):
"""Use scipy to calcualte absoliute Z-scores
and return boolean series where True indicates it is an outlier
Args:
data (DataFrame,Series,or ndarray): data to test for outliers.
col (str): If passing a DataFrame, must specify column to use.
Returns:
[boolean Series]: A True/False for each row use to slice outliers.
EXAMPLE USE:
>> idx_outs = find_outliers_df(df,col='AdjustedCompensation')
>> good_data = data[~idx_outs].copy()
"""
import pandas as pd
from scipy import stats
import numpy as np
if isinstance(data, pd.DataFrame):
if col is None:
raise Exception('If passing a DataFrame, must provide col=')
else:
data = data[col]
elif isinstance(data,np.ndarray):
data= pd.Series(data)
elif isinstance(data,pd.Series):
pass
else:
raise Exception('data must be a DataFrame, Series, or np.ndarray')
z = np.abs(stats.zscore(data))
idx_outliers = np.where(z>3,True,False)
return idx_outliers