pyleida.stats
The module 'pyleida.stats' provides functions to execute statistical analyses on the dynamical system theory metrics.
Expand source code
"""The module 'pyleida.stats' provides functions
to execute statistical analyses on the dynamical
system theory metrics."""
from ._stats import (
ks_distance,
_compute_stats,
scatter_pvalues,
permtest_ind,
permtest_rel,
hedges_g
)
__all__ = [
"ks_distance",
"_compute_stats",
"scatter_pvalues",
"permtest_ind",
"permtest_rel",
"hedges_g"
]
Functions
def _compute_stats(dynamics_data, paired_tests=False, n_perm=5000, alternative='two-sided', save_results=True, path=None)-
Performs the statistical analysis of dwell times and occupancies for each cluster (i.e., phase-locking state) of each K partition.
Params:
dynamics_data : dict. Output of 'compute_dynamics_metrics' function. Contains the values of a given dynamical systems theory metric for each K partition.
paired_tests : bool. Default: False. Specify if groups are independent or related/paired, to run the correct test.
n_perm : int. Number of permutations.
alternative : str. Default: 'two-sided'. Specify the hypothesis to test. Options: 'two-sided','greater','less'.
save_results : bool. Whether to save results on local folder.
path : str. Specify the path in which the results will be saved if 'save_results' was set to True.
Expand source code
def _compute_stats(dynamics_data, paired_tests=False, n_perm=5_000, alternative='two-sided', save_results=True, path=None):
    """
    Performs the statistical analysis of dwell times and occupancies
    for each cluster (i.e., phase-locking state) of each K partition.

    Params:
    --------
    dynamics_data : dict.
        Output of 'compute_dynamics_metrics' function. Contains the values
        of a given dynamical systems theory metric for each K partition.

    paired_tests : bool. Default: False.
        Specify if groups are independent or related/paired,
        to run the correct test.

    n_perm : int.
        Number of permutations.

    alternative : str. Default: 'two-sided'.
        Specify the hypothesis to test.
        Options: 'two-sided','greater','less'.

    save_results : bool.
        Whether to save results on local folder.

    path : str.
        Specify the path in which the results will be saved
        if 'save_results' was set to True.

    Returns:
    --------
    stats_all : dict.
        Nested mapping {metric: {k_label: results DataFrame}}.
    """
    # validation of input arguments
    alternative_options = ['two-sided', 'greater', 'less']
    if alternative not in alternative_options:
        raise ValueError(f"Valid 'alternative' options are {alternative_options}.")

    # select the statistical test once; both share the same signature
    test_func = permtest_rel if paired_tests else permtest_ind

    stats_all = {}
    ks = list(dynamics_data['occupancies'].keys())

    for metric in ['dwell_times', 'occupancies']:
        stats_all[metric] = {}
        for k in ks:
            data = dynamics_data[metric][k]
            data = data.iloc[:, 1:]  # drop the column with subject_ids
            conditions = np.unique(data.condition)  # get conditions

            results_stacked = []
            # run one permutation test per pair of conditions
            for conds in combinations(conditions, 2):
                data_ = data[data.condition.isin(conds)]  # keep data from current 2 conditions
                results_stacked.append(
                    test_func(
                        data_,
                        class_column='condition',
                        alternative=alternative,
                        n_perm=n_perm
                    )
                )

            if not results_stacked:
                # previously this fell through to a NameError on an
                # undefined variable; fail with a clear message instead
                raise ValueError(
                    f"At least 2 conditions are required, but found {conditions.size}."
                )

            # each permtest_* call already returns a DataFrame, so a
            # single pair needs no extra wrapping
            if len(results_stacked) > 1:
                metric_results = pd.concat(results_stacked, axis=0).reset_index(drop=True)
            else:
                metric_results = results_stacked[0]

            # prepend the numeric k extracted from the partition label (e.g. 'k_5' -> 5)
            k_number = int(next(tok for tok in k.replace('_', ' ').split() if tok.isdigit()))
            metric_results.insert(0, 'k', k_number)
            stats_all[metric][k] = metric_results

            if save_results:
                try:
                    # path in which the dynamical systems theory metrics for each k were saved
                    data_path = f'{path}/dynamics_metrics'
                    metric_results.to_csv(f'{data_path}/{k}/{metric}_stats.csv', sep='\t', index=False)
                except OSError as exc:
                    # was a bare 'except:' — keep the best-effort behavior
                    # but report what actually went wrong
                    print(f"Warning: there was a problem when saving the results to local folder. ({exc})")

    print('*The statistical analysis has finished.')

    return stats_all
Computes Hedges's g (effect size). To understand the magnitude of the detected intergroup differences independently of the sample size, the effect size is estimated using Hedge's statistic (Hedges, 1981). The use of this measure is based on its appropriateness to measure the effect size for the difference between means and to account for the size of the sample from each group.
Params:
x1 : ndarray with shape (N_samples). Observations of 1st condition/group.
x2 : ndarray with shape (N_samples). Observations of 2nd condition/group.
paired : bool. Whether conditions/groups are paired or independent.
Returns:
g : float. Hedge's effect size.
Expand source code
def hedges_g(x1,x2,paired=False): """ Computes Hedges's g (effect size). To understand the magnitude of the detected intergroup differences independently of the sample size, the effect size is estimated using Hedge's statistic (Hedges, 1981). The use of this measure is based on its appropriateness to easure the effect size for the difference between means and to account for the size of the sample from each group. Params: ------- x1 : ndarray with shape (N_samples). Observations of 1st condition/group. x2 : ndarray with shape (N_samples). Observations of 2nd condition/group. paired : bool. Whether conditions/groups are paired or independent. Returns: -------- g : float. Hedge's effect size. """ #sample sizes n1 = x1.size n2 = x2.size #degrees of freedom dof = n1+n2-2 #variances var1 = np.var(x1) var2 = np.var(x2) #difference in means m1 = np.mean(x1) m2 = np.mean(x2) diff_mean = np.abs(m1-m2) #pooled standard deviation #s1 = np.std(x1) #s2 = np.std(x2) #Hedges's g if not paired: s_pooled = np.sqrt( (((n1-1)*var1)+((n2-1)*var2))/dof ) g = diff_mean/s_pooled else: g = diff_mean / np.sqrt((var1+var2) / 2) return g def ks_distance(txt1, txt2, plot=False)-
Compute the 2-sample Kolmogorov-Smirnov test of goodness-of-fit (aka distance) between distributions of two (or groups of) Time x Time matrices. This technique can be used, e.g., to compare matrices between groups/conditions or to determine the best fit between a simulated FC matrix and a real/empirical FC matrix.
Params:
txt1 : ndarray with shape (N_time_points, N_time_points) or (N_time_points, N_time_points, N_subjects).
txt2 : ndarray with shape (N_time_points, N_time_points) or (N_time_points, N_time_points, N_subjects).
Returns:
distance : float. Computed distance between matrices.
pval : float P-value of the statistical test.
Expand source code
def ks_distance(txt1,txt2,plot=False): """ Compute the 2-sample Kolmogorov-Smirnov test of goodness-of-fit (aka distance) between distributions of two (or groups of) Time x Time matrices. This technique can be used, e.g., to compare matrices between groups/conditions or to determine the best fit between a simulated FC matrix and a real/empirical FC matrix. Params: ------- txt1 : ndarray with shape (N_time_points, N_time_points) or (N_time_points, N_time_points, N_subjects). txt2 : ndarray with shape (N_time_points, N_time_points) or (N_time_points, N_time_points, N_subjects). Returns: -------- distance : float Computed distances between matrices. pval : float P-value of the statistical test. """ if txt1.ndim>2: txt1_,txt2_ = [],[] for sub in range(txt1.shape[-1]): txt1_.extend(txt1[:,:,sub][np.triu_indices_from(txt1[:,:,sub],k=1)]) txt2_.extend(txt2[:,:,sub][np.triu_indices_from(txt2[:,:,sub],k=1)]) txt1_, txt2_ = np.array(txt1_),np.array(txt2_) else: txt1_ = txt1[np.triu_indices_from(txt1,k=1)] txt2_ = txt2[np.triu_indices_from(txt2,k=1)] distance,pval = ks_2samp(txt1_,txt2_) if plot: plt.figure() sns.distplot(txt1_) sns.distplot(txt2_) plt.tight_layout() plt.show() return distance, pval def permtest_ind(data, class_column=None, n_perm=5000, alternative='two-sided')-
Compute a permutation test on two independent groups for each variable or feature in 'data', and additionally computes a Bonferroni-corrected alpha.
Params:
data : pd.dataframe. Contains the dataset with the metric values we want to compare. E.g.: the fractional occupancies.
class_column : str. Specify the name of the column that contains the class/group/session/condition information.
n_perm : int. Default 5000. Select the number of permutations to perform.
alternative : str. Default='two-sided'. Select the test type. Options are 'less', 'two-sided', 'greater'.
Returns:
results : pd.dataframe. Contains the results of the permutation tests.
Expand source code
def permtest_ind(data, class_column=None, n_perm=5_000, alternative='two-sided'):
    """
    Compute a permutation test on two independent groups for each variable
    or feature in 'data', and additionally computes a Bonferroni-corrected
    alpha.

    Params:
    -------
    data : pd.dataframe.
        Contains the dataset with the metric values we want to compare.
        E.g.: the fractional occupancies.

    class_column : str.
        Specify the name of the column that contains the
        class/group/session/condition information.

    n_perm : int. Default 5000.
        Select the number of permutations to perform.

    alternative : str. Default='two-sided'.
        Select the test type. Options are 'less', 'two-sided', 'greater'.

    Returns:
    --------
    results : pd.dataframe.
        Contains the results of the permutation tests.
    """
    # Validation of input data
    if class_column is None:
        raise ValueError("You must specify the 'class_column'.")
    elif class_column not in data.columns:
        raise ValueError(f"The 'class_column' '{class_column}' was not found in 'data'!")
    if not isinstance(n_perm, int):
        raise TypeError("'n_perm' must be an integer!")

    features = [col for col in data if col != class_column]  # variables to be tested
    n_tests = len(features)  # number of tests that will be executed
    groups = np.unique(data[class_column])  # group names

    if groups.size != 2:
        # The loop below compares exactly two groups; with more groups the
        # old code silently tested groups[0] vs groups[1] while labeling
        # the second as groups[-1]. Fail loudly instead.
        raise ValueError(f"Exactly 2 groups are required, but found {groups.size}.")

    results = []
    for col in features:
        x1 = data[data[class_column] == groups[0]][col].values  # data of first group
        x2 = data[data[class_column] == groups[1]][col].values  # data of the other group

        # Levene's test decides between Student's t (equal variances assumed)
        # and Welch's t-test
        _, p_levene = levene(x1, x2, center='mean')

        # permutation-based t-test
        test = ttest_ind(
            x1,
            x2,
            alternative=alternative,
            permutations=n_perm,
            equal_var=p_levene > 0.05
        )

        # effect size
        eff = hedges_g(x1, x2)

        results.append({
            'variable': col,
            'group_1': groups[0],
            'group_2': groups[1],  # was groups[-1]: mislabeled whenever >2 groups slipped through
            'statistic': test.statistic,
            'p-value': test.pvalue,
            'test': 't-test' if p_levene > 0.05 else 'welch',
            'effect_size': eff
        })

    results = pd.DataFrame(results)

    # Bonferroni correction for multiple testing comparison
    results['alpha_Bonferroni'] = 0.05 / n_tests
    results['reject_null'] = results['p-value'] < results['alpha_Bonferroni']

    return results
Compute a permutation test on two related-paired groups for each variable or feature in 'data', and additionally computes a Bonferroni-corrected alpha.
Params:
data : pd.dataframe. Contains the dataset with the metric values we want to compare. E.g.: the fractional occupancies.
class_column : str. Specify the name of the column that contains the class/group/session/condition information.
n_perm : int. Default 5000. Select the number of permutations to perform.
alternative : str. Default='two-sided'. Select the test type. Options are 'less', 'two-sided', 'greater'.
Returns:
results : pd.dataframe. Contains the results of the permutation test.
Expand source code
def permtest_rel(data, class_column=None, n_perm=5_000, alternative='two-sided'):
    """
    Compute a permutation test on two related/paired groups for each
    variable or feature in 'data', and additionally computes a
    Bonferroni-corrected alpha.

    Params:
    -------
    data : pd.dataframe.
        Contains the dataset with the metric values we want to compare.
        E.g.: the fractional occupancies.

    class_column : str.
        Specify the name of the column that contains the
        class/group/session/condition information.

    n_perm : int. Default 5000.
        Select the number of permutations to perform.

    alternative : str. Default='two-sided'.
        Select the test type. Options are 'less', 'two-sided', 'greater'.

    Returns:
    --------
    results : pd.dataframe.
        Contains the results of the permutation test.
    """
    # Validation of input data
    if class_column is None:
        raise ValueError("You must specify the 'class_column'.")
    elif class_column not in data.columns:
        raise ValueError(f"The 'class_column' '{class_column}' is not present in 'data'!")
    if not isinstance(n_perm, int):
        raise TypeError("'n_perm' must be an integer!")

    features = [col for col in data if col != class_column]  # variables to be tested
    n_tests = len(features)  # number of tests that will be executed
    groups = np.unique(data[class_column])  # group names

    if groups.size != 2:
        # A paired test compares exactly two conditions; with more, the old
        # code silently compared groups[0] vs groups[1] while labeling the
        # second as groups[-1]. Fail loudly instead.
        raise ValueError(f"Exactly 2 groups are required, but found {groups.size}.")

    results = []
    rng = np.random.default_rng()

    for col in features:
        x1 = data[data[class_column] == groups[0]][col].values  # data of first group
        x2 = data[data[class_column] == groups[1]][col].values  # data of the other group

        # paired permutation test: observations are permuted within each
        # sample pair ('samples' permutation type)
        test = permutation_test(
            (x1, x2),
            _statistic,
            n_resamples=n_perm,
            vectorized=True,
            alternative=alternative,
            permutation_type='samples',
            random_state=rng
        )

        # effect size
        eff = hedges_g(x1, x2, paired=True)

        results.append({
            'variable': col,
            'group_1': groups[0],
            'group_2': groups[1],  # was groups[-1]: mislabeled whenever >2 groups slipped through
            'statistic': test.statistic,
            'p-value': test.pvalue,
            'effect_size': eff
        })

    results = pd.DataFrame(results)

    # Bonferroni correction for multiple testing comparison
    results['alpha_Bonferroni'] = 0.05 / n_tests
    results['reject_null'] = results['p-value'] < results['alpha_Bonferroni']

    return results
Create a scatter plot showing the computed p-values for each cluster in each clustering partition ('k'). In addition, the plot shows the significance thresholds defined by the standard alpha value (0.05) and the Bonferroni-corrected alpha for each k (0.05/k).
Params:
pooled_stats : pandas.dataframe. Contain the computed statistics for each cluster and for each k.
metric : str. Optional. Specify the provided metric. Just used for plot title. If None, no title is shown.
fill_areas : bool. Whether to fill with color the areas defined by the two significance thresholds.
Expand source code
def scatter_pvalues(pooled_stats, metric='occupancies', fill_areas=True, darkstyle=False):
    """
    Create a scatter plot showing the computed p-values for each cluster
    in each clustering partition ('k'). In addition, the plot shows the
    significance thresholds defined by the standard alpha value (0.05)
    and the Bonferroni-corrected alpha for each k (0.05/k).

    Params:
    -------
    pooled_stats : pandas.dataframe.
        Contain the computed statistics for each cluster and for each k.

    metric : str. Optional.
        Specify the provided metric. Just used for plot title.
        If None, no title is shown.

    fill_areas : bool.
        Whether to fill with color the areas defined by the two
        significance thresholds.

    darkstyle : bool.
        Whether to render the plot on a dark background.
    """
    # Rename on a copy: the previous in-place rename mutated the caller's
    # DataFrame as a hidden side effect.
    pooled_stats = pooled_stats.rename(columns={'alpha_Bonferroni': 'bonf', 'p-value': 'p'})

    K_min = min(np.unique(pooled_stats.k))
    K_max = max(np.unique(pooled_stats.k))
    alpha3 = np.sum(np.arange(K_min, K_max + 1, 1))  # sum of all k values: strictest threshold is 0.05/alpha3

    # Assign a color to each p-value depending on which thresholds it beats.
    color_list = []
    for pval, bonf in zip(pooled_stats.p, pooled_stats.bonf):
        color_list.append(
            'mediumseagreen' if 0.05 / alpha3 < pval < bonf
            else 'firebrick' if 0.05 > pval > bonf
            else 'royalblue' if pval < 0.05 / alpha3
            else ('black' if not darkstyle else 'white')
        )

    # Add boundary points so the Bonferroni curve spans K_min-1 .. K_max+1.
    # Use an explicit .copy(): the original wrote through a slice of
    # 'pooled_stats' (SettingWithCopyWarning / undefined behavior).
    # NOTE(review): K_min == 1 would divide by zero below — presumably
    # clustering partitions always have k >= 2; confirm upstream.
    bonf_data = pooled_stats[['k', 'bonf']].copy()
    bonf_data.loc[len(bonf_data.index)] = [K_min - 1, 0.05 / (K_min - 1)]
    bonf_data.loc[len(bonf_data.index)] = [K_max + 1, 0.05 / (K_max + 1)]
    bonf_data = bonf_data.sort_values(by='k').reset_index(drop=True)

    # plotting
    with plt.style.context('dark_background' if darkstyle else 'default'):
        plt.ion()
        plt.figure(figsize=(11, 6))
        plt.scatter(pooled_stats.k, pooled_stats.p, c=color_list, s=2)
        plt.axhline(0.05, linestyle="dashed", c='firebrick', label=r'$\alpha^{1}$ = 0.05')
        plt.plot(
            bonf_data.k,
            bonf_data.bonf,
            linestyle="dashed",
            c='mediumseagreen',
            label=r'$\alpha^{2}$ = $\alpha^{1}$ / k'
        )
        plt.axhline(
            0.05 / alpha3,
            linestyle="dashed",
            c='royalblue',
            label=r'$\alpha^{3}$ = $\alpha^{1}$ / Σ (k)'
        )
        plt.yscale('log')
        plt.xlabel('Number of PL states in\neach clustering solution (K)', fontsize=15)
        plt.ylabel('Two-sided p-value', fontsize=15)
        plt.xticks(np.arange(K_min, K_max + 1), [k for k in np.arange(K_min, K_max + 1)])
        plt.xlim(left=K_min - 1, right=K_max + 1)
        plt.ylim(bottom=0.00000001)
        if metric is not None:
            title = f"{str(metric).capitalize().replace('_',' ')}: {np.unique(pooled_stats.group_1)[0]} vs {np.unique(pooled_stats.group_2)[0]}"
            plt.title(title, fontsize=20, pad=20)
        plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
        if fill_areas:
            plt.fill_between(
                np.arange(K_min - 1, K_max + 2),
                np.flip(np.unique(bonf_data.bonf)),
                0.05 / alpha3,
                alpha=0.15,
                color='mediumseagreen'
            )
            plt.fill_between(
                np.arange(K_min - 1, K_max + 2),
                np.flip(np.unique(bonf_data.bonf)),
                0.05,
                alpha=0.15,
                color='firebrick'
            )
            plt.fill_between(
                np.arange(K_min - 1, K_max + 2),
                -1,
                0.05 / alpha3,
                alpha=0.15,
                color='royalblue'
            )
        plt.tight_layout()