Source code for fsds.default

"""Main module."""
#flatiron_stats


[docs]def welch_t(a, b): import numpy as np import scipy.stats as stats import scipy """ Calculate Welch's t statistic for two samples. """ numerator = a.mean() - b.mean() # “ddof = Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, # where N represents the number of elements. By default ddof is zero. denominator = np.sqrt(a.var(ddof=1)/a.size + b.var(ddof=1)/b.size) return np.abs(numerator/denominator)
[docs]def welch_df(a, b): import numpy as np import scipy.stats as stats import scipy # Calculate the effective degrees of freedom for two samples. This function returns the degrees of freedom s1 = a.var(ddof=1) s2 = b.var(ddof=1) n1 = a.size n2 = b.size numerator = (s1/n1 + s2/n2)**2 denominator = (s1/ n1)**2/(n1 - 1) + (s2/ n2)**2/(n2 - 1) return numerator/denominator
[docs]def p_value_welch_ttest(a, b, two_sided=False): """Calculates the p-value for Welch's t-test given two samples. By default, the returned p-value is for a one-sided t-test. Set the two-sided parameter to True if you wish to perform a two-sided t-test instead. """ import numpy as np import scipy.stats as stats import scipy t = welch_t(a, b) df = welch_df(a, b) p = 1-stats.t.cdf(np.abs(t), df) if two_sided: return 2*p else: return p
[docs]def evaluate_PDF(rv, x=4): '''Input: a random variable object, standard deviation output : x and y values for the normal distribution ''' import numpy as np import scipy.stats as stats import scipy # Identify the mean and standard deviation of random variable mean = rv.mean() std = rv.std() # Use numpy to calculate evenly spaced numbers over the specified interval (4 sd) and generate 100 samples. xs = np.linspace(mean - x*std, mean + x*std, 100) # Calculate the peak of normal distribution i.e. probability density. ys = rv.pdf(xs) return xs, ys # Return calculated values
[docs]def overlap_superiority(group1, group2, n=1000): """Estimates overlap and superiority based on a sample. group1: scipy.stats rv object group2: scipy.stats rv object n: sample size """ import numpy as np import scipy.stats as stats import scipy # Get a sample of size n from both groups group1_sample = group1.rvs(n) group2_sample = group2.rvs(n) # Identify the threshold between samples thresh = (group1.mean() + group2.mean()) / 2 print(thresh) # Calculate no. of values above and below for group 1 and group 2 respectively above = sum(group1_sample < thresh) below = sum(group2_sample > thresh) # Calculate the overlap overlap = (above + below) / n # Calculate probability of superiority superiority = sum(x > y for x, y in zip(group1_sample, group2_sample)) / n return overlap, superiority
[docs]def Cohen_d(group1, group2, correction = False): """Compute Cohen's d d = (group1.mean()-group2.mean())/pool_variance. pooled_variance= (n1 * var1 + n2 * var2) / (n1 + n2) Args: group1 (Series or NumPy array): group 1 for calculating d group2 (Series or NumPy array): group 2 for calculating d correction (bool): Apply equation correction if N<50. Default is False. - Url with small ncorrection equation: - https://www.statisticshowto.datasciencecentral.com/cohens-d/ Returns: d (float): calculated d value INTERPRETATION OF COHEN's D: > Small effect = 0.2 > Medium Effect = 0.5 > Large Effect = 0.8 """ import scipy.stats as stats import scipy import numpy as np N = len(group1)+len(group2) diff = group1.mean() - group2.mean() n1, n2 = len(group1), len(group2) var1 = group1.var() var2 = group2.var() # Calculate the pooled threshold as shown earlier pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2) # Calculate Cohen's d statistic d = diff / np.sqrt(pooled_var) ## Apply correction if needed if (N < 50) & (correction==True): d=d * ((N-3)/(N-2.25))*np.sqrt((N-2)/N) return d
[docs]def plot_pdfs(cohen_d=2): """Plot PDFs for distributions that differ by some number of stds. cohen_d: number of standard deviations between the means """ import numpy as np import scipy.stats as stats import scipy import scipy import matplotlib.pyplot as plt group1 = scipy.stats.norm(0, 1) group2 = scipy.stats.norm(cohen_d, 1) xs, ys = evaluate_PDF(group1) plt.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7) xs, ys = evaluate_PDF(group2) plt.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7) o, s = overlap_superiority(group1, group2) print('overlap', o) print('superiority', s)
# def find_outliers(df,col=None,report=True): # """ # ` Uses Tukey's Interquartile Range Method to find outliers. # - threshold = 1.5 * IQR # - Lower threshold = [25% quartile] - threshold # - Upper threshold = [75% quartile] + threshold # - Outliers are below the lower or above the upper threshold. # Returns a series of T/F for each row for slicing outliers: df[idx_out] # EXAMPLE USE: # >> idx_outs = find_outliers_df(df,col='AdjustedCompensation') # >> good_data = data[~idx_outs].copy() # """ # import numpy as np # import scipy.stats as stats # import pandas as pd # if isinstance(df,pd.DataFrame) & (col is None): # raise Exception('Must provide a column name if passing a DataFrame.') # elif col is not None: # data = df[col].copy() # else: # data = df.copy() # res = data.describe() # iqr_thresh = 1.5 * (res['75%'] - res['25%']) # upper = res['75%']+iqr_thresh # lower = res['25%']-iqr_thresh # idx_outs = ((data>upper)|(data<lower)) # def get_true_count(idx_outs): # if True in idx_outs.value_counts().index: # return idx_outs.value_counts()[True] # else: # return 0 # if report: # print(f'[i] outliers for {col}: {get_true_count(idx_outs)} / {len(data)} total.') # return idx_outs
[docs]def find_outliers_IQR(data,col=None): """ Use Tukey's Method of outlier removal AKA InterQuartile-Range Rule and return boolean series where True indicates it is an outlier. - Calculates the range between the 75% and 25% quartiles - Outliers fall outside upper and lower limits, using a treshold of 1.5*IQR the 75% and 25% quartiles. IQR Range Calculation: res = df.describe() IQR = res['75%'] - res['25%'] lower_limit = res['25%'] - 1.5*IQR upper_limit = res['75%'] + 1.5*IQR Args: data (DataFrame,Series,or ndarray): data to test for outliers. col (str): If passing a DataFrame, must specify column to use. Returns: [boolean Series]: A True/False for each row use to slice outliers. EXAMPLE USE: >> idx_outs = find_outliers_df(df,col='AdjustedCompensation') >> good_data = data[~idx_outs].copy() """ import pandas as pd from scipy import stats import numpy as np if isinstance(data, pd.DataFrame): if col is None: raise Exception('If passing a DataFrame, must provide col=') else: data = data[col] elif isinstance(data,np.ndarray): data= pd.Series(data) elif isinstance(data,pd.Series): pass else: raise Exception('data must be a DataFrame, Series, or np.ndarray') res = data.describe() IQR = res['75%'] - res['25%'] lower_limit = res['25%'] - 1.5*IQR upper_limit = res['75%'] + 1.5*IQR idx_outs = (data>upper_limit) | (data<lower_limit) return idx_outs
[docs]def find_outliers_Z(data,col=None): """Use scipy to calcualte absoliute Z-scores and return boolean series where True indicates it is an outlier Args: data (DataFrame,Series,or ndarray): data to test for outliers. col (str): If passing a DataFrame, must specify column to use. Returns: [boolean Series]: A True/False for each row use to slice outliers. EXAMPLE USE: >> idx_outs = find_outliers_df(df,col='AdjustedCompensation') >> good_data = data[~idx_outs].copy() """ import pandas as pd from scipy import stats import numpy as np if isinstance(data, pd.DataFrame): if col is None: raise Exception('If passing a DataFrame, must provide col=') else: data = data[col] elif isinstance(data,np.ndarray): data= pd.Series(data) elif isinstance(data,pd.Series): pass else: raise Exception('data must be a DataFrame, Series, or np.ndarray') z = np.abs(stats.zscore(data)) idx_outliers = np.where(z>3,True,False) return idx_outliers