Source code for fsds.default

"""Main module."""
#flatiron_stats


[docs]def welch_t(a, b):
    import numpy as np
    import scipy.stats as stats
    import scipy   
    """ Calculate Welch's t statistic for two samples. """

    numerator = a.mean() - b.mean()
    
    # “ddof = Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, 
    #  where N represents the number of elements. By default ddof is zero.
    
    denominator = np.sqrt(a.var(ddof=1)/a.size + b.var(ddof=1)/b.size)
    
    return np.abs(numerator/denominator)


[docs]def welch_df(a, b):
    
    import numpy as np
    import scipy.stats as stats
    import scipy   
     # Calculate the effective degrees of freedom for two samples. This function returns the degrees of freedom
    
    s1 = a.var(ddof=1) 
    s2 = b.var(ddof=1)
    n1 = a.size
    n2 = b.size
    
    numerator = (s1/n1 + s2/n2)**2
    denominator = (s1/ n1)**2/(n1 - 1) + (s2/ n2)**2/(n2 - 1)
    
    return numerator/denominator


[docs]def p_value_welch_ttest(a, b, two_sided=False):
    """Calculates the p-value for Welch's t-test given two samples.
    By default, the returned p-value is for a one-sided t-test. 
    Set the two-sided parameter to True if you wish to perform a two-sided t-test instead.
    """
    import numpy as np
    import scipy.stats as stats
    import scipy   
    
    t = welch_t(a, b)
    df = welch_df(a, b)
    
    p = 1-stats.t.cdf(np.abs(t), df)
    
    if two_sided:
        return 2*p
    else:
        return p
    
    


[docs]def evaluate_PDF(rv, x=4):
    '''Input: a random variable object, standard deviation
    output : x and y values for the normal distribution
    '''
    import numpy as np
    import scipy.stats as stats
    import scipy       
    # Identify the mean and standard deviation of random variable 
    mean = rv.mean()
    std = rv.std()

    # Use numpy to calculate evenly spaced numbers over the specified interval (4 sd) and generate 100 samples.
    xs = np.linspace(mean - x*std, mean + x*std, 100)
    
    # Calculate the peak of normal distribution i.e. probability density. 
    ys = rv.pdf(xs)

    return xs, ys # Return calculated values


[docs]def overlap_superiority(group1, group2, n=1000):
    """Estimates overlap and superiority based on a sample.
    
    group1: scipy.stats rv object
    group2: scipy.stats rv object
    n: sample size
    """
    import numpy as np
    import scipy.stats as stats
    import scipy   
    # Get a sample of size n from both groups
    group1_sample = group1.rvs(n)
    group2_sample = group2.rvs(n)
    
    # Identify the threshold between samples
    thresh = (group1.mean() + group2.mean()) / 2
    print(thresh)
    
    # Calculate no. of values above and below for group 1 and group 2 respectively
    above = sum(group1_sample < thresh)
    below = sum(group2_sample > thresh)
    
    # Calculate the overlap
    overlap = (above + below) / n
    
    # Calculate probability of superiority
    superiority = sum(x > y for x, y in zip(group1_sample, group2_sample)) / n

    return overlap, superiority


[docs]def Cohen_d(group1, group2, correction = False):
    """Compute Cohen's d
    d = (group1.mean()-group2.mean())/pool_variance.
    pooled_variance= (n1 * var1 + n2 * var2) / (n1 + n2)

    Args:
        group1 (Series or NumPy array): group 1 for calculating d
        group2 (Series or NumPy array): group 2 for calculating d
        correction (bool): Apply equation correction if N<50. Default is False. 
            - Url with small ncorrection equation: 
                - https://www.statisticshowto.datasciencecentral.com/cohens-d/ 
    Returns:
        d (float): calculated d value
         
    INTERPRETATION OF COHEN's D: 
    > Small effect = 0.2
    > Medium Effect = 0.5
    > Large Effect = 0.8
    
    """
    import scipy.stats as stats
    import scipy   
    import numpy as np
    N = len(group1)+len(group2)
    diff = group1.mean() - group2.mean()

    n1, n2 = len(group1), len(group2)
    var1 = group1.var()
    var2 = group2.var()

    # Calculate the pooled threshold as shown earlier
    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    
    # Calculate Cohen's d statistic
    d = diff / np.sqrt(pooled_var)
    
    ## Apply correction if needed
    if (N < 50) & (correction==True):
        d=d * ((N-3)/(N-2.25))*np.sqrt((N-2)/N)
    
    return d


[docs]def plot_pdfs(cohen_d=2):
    """Plot PDFs for distributions that differ by some number of stds.
    
    cohen_d: number of standard deviations between the means
    """
    import numpy as np
    import scipy.stats as stats
    import scipy   
    import scipy
    import matplotlib.pyplot as plt
    group1 = scipy.stats.norm(0, 1)
    group2 = scipy.stats.norm(cohen_d, 1)
    xs, ys = evaluate_PDF(group1)
    plt.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)

    xs, ys = evaluate_PDF(group2)
    plt.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)
    
    o, s = overlap_superiority(group1, group2)
    print('overlap', o)
    print('superiority', s)
    
    
# def find_outliers(df,col=None,report=True):
#     """
# `    Uses Tukey's Interquartile Range Method to find outliers.
#     - threshold =  1.5 * IQR
#         - Lower threshold = [25% quartile] - threshold
#         - Upper threshold = [75% quartile] + threshold
#     - Outliers are below the lower or above the upper threshold.
    
#     Returns a series of T/F for each row for slicing outliers: df[idx_out]
    
    
#     EXAMPLE USE: 
#     >> idx_outs = find_outliers_df(df,col='AdjustedCompensation')
#     >> good_data = data[~idx_outs].copy()
#     """
#     import numpy as np
#     import scipy.stats as stats
#     import pandas as pd
#     if isinstance(df,pd.DataFrame) & (col is None):
#         raise Exception('Must provide a column name if passing a DataFrame.')
#     elif col is not None:
#         data = df[col].copy()
#     else:
#         data = df.copy()

#     res = data.describe()
#     iqr_thresh = 1.5 * (res['75%'] - res['25%'])
#     upper = res['75%']+iqr_thresh
#     lower = res['25%']-iqr_thresh

#     idx_outs = ((data>upper)|(data<lower))

#     def get_true_count(idx_outs):
#         if True in idx_outs.value_counts().index:
#             return idx_outs.value_counts()[True]
#         else:
#             return 0
        
#     if report:
#         print(f'[i] outliers for {col}: {get_true_count(idx_outs)} / {len(data)} total.')
#     return idx_outs

[docs]def find_outliers_IQR(data,col=None):
    """
    Use Tukey's Method of outlier removal AKA InterQuartile-Range Rule
    and return boolean series where True indicates it is an outlier.
    - Calculates the range between the 75% and 25% quartiles
    - Outliers fall outside upper and lower limits, using a treshold of  1.5*IQR the 75% and 25% quartiles.

    IQR Range Calculation:    
        res = df.describe()
        IQR = res['75%'] -  res['25%']
        lower_limit = res['25%'] - 1.5*IQR
        upper_limit = res['75%'] + 1.5*IQR

    Args:
        data (DataFrame,Series,or ndarray): data to test for outliers.
        col (str): If passing a DataFrame, must specify column to use.

    Returns:
        [boolean Series]: A True/False for each row use to slice outliers.
        
    EXAMPLE USE: 
    >> idx_outs = find_outliers_df(df,col='AdjustedCompensation')
    >> good_data = data[~idx_outs].copy()
    
    """
    import pandas as pd
    from scipy import stats
    import numpy as np

    if isinstance(data, pd.DataFrame):
        if col is None:
            raise Exception('If passing a DataFrame, must provide col=')
        else:
            data = data[col]
    elif isinstance(data,np.ndarray):
        data= pd.Series(data)

    elif isinstance(data,pd.Series):
        pass
    else:
        raise Exception('data must be a DataFrame, Series, or np.ndarray')
    
    res = data.describe()
        
    
    IQR = res['75%'] -  res['25%']
    lower_limit = res['25%'] - 1.5*IQR
    upper_limit = res['75%'] + 1.5*IQR

    idx_outs = (data>upper_limit) | (data<lower_limit) 

    return idx_outs


[docs]def find_outliers_Z(data,col=None):
    """Use scipy to calcualte absoliute Z-scores 
    and return boolean series where True indicates it is an outlier

    Args:
        data (DataFrame,Series,or ndarray): data to test for outliers.
        col (str): If passing a DataFrame, must specify column to use.

    Returns:
        [boolean Series]: A True/False for each row use to slice outliers.
        
    EXAMPLE USE: 
    >> idx_outs = find_outliers_df(df,col='AdjustedCompensation')
    >> good_data = data[~idx_outs].copy()
    """
    import pandas as pd
    from scipy import stats
    import numpy as np

    if isinstance(data, pd.DataFrame):
        if col is None:
            raise Exception('If passing a DataFrame, must provide col=')
        else:
            data = data[col]
    elif isinstance(data,np.ndarray):
        data= pd.Series(data)

    elif isinstance(data,pd.Series):
        pass
    else:
        raise Exception('data must be a DataFrame, Series, or np.ndarray')
    
    z = np.abs(stats.zscore(data))
    idx_outliers = np.where(z>3,True,False)
    return idx_outliers
Source code for fsds.default

fs_ds

Navigation

Related Topics