Analytic Statistics#

  • This notebook covers common analytic statistical analyses in Python.

  • That said, R is generally better suited for these tasks.

import numpy as np
import pandas as pd
import scipy.stats as stats

DEMO_DATA_ROOT = "../../../RepositoryData/data"

Kolmogorov-Smirnov Test#

  • Two independent samples

  • Compares the two groups' distributions via scipy.stats.ks_2samp (a rank-based alternative is sketched after the output below)

hedges = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-1-2-1_hedges.csv")
hedges.head()
   CASE  HEDGES SEX
0     1      17   F
1     2      17   F
2     3      17   F
3     4      17   F
4     5      16   F
ks_statistic, p = stats.ks_2samp(hedges[hedges['SEX']=="M"]['HEDGES'],
                                 hedges[hedges['SEX']=="F"]['HEDGES'])
print(ks_statistic, '\n', p)
0.4666666666666667 
 0.0025300622362698397
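
For comparing the central tendencies of the two independent samples, the rank-based Mann-Whitney U test (equivalent to the Kruskal-Wallis test in the two-group case) is a common alternative to the Kolmogorov-Smirnov test. A minimal sketch on the same hedges data; the use of scipy.stats.mannwhitneyu and the two-sided alternative are assumptions rather than part of the original analysis, and the output is not shown.

# Rank-based comparison of the two groups (sketch; results not shown)
u_statistic, p = stats.mannwhitneyu(hedges[hedges['SEX']=="M"]['HEDGES'],
                                    hedges[hedges['SEX']=="F"]['HEDGES'],
                                    alternative="two-sided")
print(u_statistic, '\n', p)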

Chi-square#

data = np.array([[85, 65],
                 [100,147]])
data
array([[ 85,  65],
       [100, 147]])
V, p, df, expected = stats.chi2_contingency(data, correction=False)
print("Chi-square value = %1.2f, df = %1.2f, p = %1.2f"%(V, df, p))
Chi-square value = 9.82, df = 1.00, p = 0.00
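
stats.chi2_contingency also returns the expected frequencies under independence, and an effect size is usually reported alongside the test statistic. A minimal sketch computing the phi coefficient (identical to Cramér's V for a 2×2 table) from the values above; the formula phi = sqrt(chi-square / N) is standard, but this step is not part of the original notebook.

# Expected frequencies under independence (returned by chi2_contingency above)
print(expected)

# Effect size for a 2x2 table: phi coefficient (= Cramér's V when the table is 2x2)
n = data.sum()
phi = np.sqrt(V / n)
print("phi = %1.3f" % phi)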

McNemar Test#

  • One categorical dependent variable

  • Dependent (paired) samples

data = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-1-2-3_accjudg.csv")
data.head()
   SENTENCE      BEFORE       AFTER
0         1  acceptable  acceptable
1         2  acceptable  acceptable
2         3  acceptable  acceptable
3         4  acceptable  acceptable
4         5  acceptable  acceptable
from statsmodels.sandbox.stats.runs import mcnemar

crosstab = pd.crosstab(data['BEFORE'],data['AFTER'])
x2, p = mcnemar(crosstab, correction=False)
print('Chi-square=%1.2f, p = %1.2f'%(x2, p))
Chi-square=13.00, p = 0.00
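
Note that mcnemar imported from statsmodels.sandbox.stats.runs has been deprecated in more recent statsmodels releases. A minimal sketch using the maintained statsmodels.stats.contingency_tables.mcnemar API instead (assuming a reasonably recent statsmodels); exact=False requests the chi-square approximation used above.

from statsmodels.stats.contingency_tables import mcnemar as mcnemar_ct

# Returns a results object with .statistic and .pvalue
result = mcnemar_ct(crosstab, exact=False, correction=False)
print('Chi-square=%1.2f, p = %1.2f' % (result.statistic, result.pvalue))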

Independent t-test#

vowels = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-3-2-1_f1-freq.csv")
vowels.head()
   CASE   HZ_F1 SEX
0     1  488.57   M
1     2  557.66   M
2     3  424.69   M
3     4  625.95   M
4     5  531.04   M
t, p = stats.ttest_ind(vowels[vowels['SEX']=='M']['HZ_F1'], vowels[vowels['SEX']=='F']['HZ_F1'])
print("t-score=%1.2f, p=%1.2f"%(t,p))
t-score=-2.44, p=0.02
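
stats.ttest_ind assumes equal variances in the two groups by default. When that assumption is doubtful, Welch's correction can be requested; a minimal sketch on the same data, where equal_var=False is the only change (results not shown).

# Welch's t-test: does not pool the variances of the two groups
t_w, p_w = stats.ttest_ind(vowels[vowels['SEX']=='M']['HZ_F1'],
                           vowels[vowels['SEX']=='F']['HZ_F1'],
                           equal_var=False)
print("t-score=%1.2f, p=%1.2f" % (t_w, p_w))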

One-way ANOVA#

data = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/05-2_reactiontimes.csv")
data
          CASE        RT  FREQUENCY FAMILIARITY IMAGEABILITY  MEANINGFULNESS
0       almond  650.9947   0.693147         NaN          NaN             NaN
1          ant  589.4347   1.945910         med           hi           415.0
2        apple  523.0493   2.302585          hi           hi           451.0
3      apricot  642.3342   0.693147          lo           lo             NaN
4    asparagus  696.2092   0.693147         med           lo           442.0
..         ...       ...        ...         ...          ...             ...
72    tortoise  733.0323   1.386294          lo           lo           403.0
73      walnut  663.5908   2.484907         med           lo           468.0
74        wasp  725.7056   1.098612         NaN          NaN             NaN
75       whale  609.9745   0.000000         med           hi           474.0
76  woodpecker  686.3439   0.693147         NaN          NaN             NaN

77 rows × 6 columns

data = data.dropna() # remove rows with missing values before fitting the model

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
model = ols('RT ~ FAMILIARITY', data).fit()
aov = anova_lm(model)
print(aov)
               df        sum_sq      mean_sq         F    PR(>F)
FAMILIARITY   2.0  16377.601187  8188.800594  4.645348  0.014654
Residual     45.0  79325.813100  1762.795847       NaN       NaN
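
The ANOVA indicates a significant effect of FAMILIARITY on RT (p ≈ 0.015), but not which familiarity levels differ from one another. A minimal sketch of a Tukey HSD post-hoc comparison with statsmodels; the alpha level of 0.05 is an assumption, and the output is not shown.

from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Pairwise comparisons of the FAMILIARITY levels on RT
tukey = pairwise_tukeyhsd(endog=data['RT'], groups=data['FAMILIARITY'], alpha=0.05)
print(tukey.summary())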