Analytic Statistics#
This section covers common analytic statistical tests.
Still, R is better for these tasks.
import numpy as np
import pandas as pd
import scipy.stats as stats
# Relative path to the folder holding the demo data sets; every
# read_table call below builds its file path by appending to this prefix.
DEMO_DATA_ROOT = "../../../RepositoryData/data"
Kolmogorov-Smirnov Test#
Two independent samples: compares their distributions
# Load the hedges data set with read_table's default delimiter and
# preview the first rows.
hedges_path = DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-1-2-1_hedges.csv"
hedges = pd.read_table(hedges_path)
hedges.head()
CASE | HEDGES | SEX | |
---|---|---|---|
0 | 1 | 17 | F |
1 | 2 | 17 | F |
2 | 3 | 17 | F |
3 | 4 | 17 | F |
4 | 5 | 16 | F |
# Two-sample Kolmogorov-Smirnov test: do the HEDGES counts of male and
# female speakers follow the same distribution?
hedges_male = hedges.loc[hedges['SEX'] == "M", 'HEDGES']
hedges_female = hedges.loc[hedges['SEX'] == "F", 'HEDGES']

# ks_2samp returns the KS statistic (D) and its p-value; it is not a
# Mann-Whitney U statistic, so the result is named accordingly.
ks_statistic, p = stats.ks_2samp(hedges_male, hedges_female)
print(ks_statistic, '\n', p)
0.4666666666666667
0.0025300622362698397
Chi-square#
# 2x2 table of observed frequencies used as input for the chi-square test.
data = np.array([
    [85, 65],
    [100, 147],
])
data
array([[ 85, 65],
[100, 147]])
# Chi-square test of independence on the contingency table, with the
# continuity correction switched off.
chi2_result = stats.chi2_contingency(data, correction=False)
V, p, df, expected = chi2_result  # statistic, p-value, degrees of freedom, expected counts
report = "Chi-square value = %1.2f, df = %1.2f, p = %1.2f" % (V, df, p)
print(report)
Chi-square value = 9.82, df = 1.00, p = 0.00
McNemar Test#
One dependent variable (categorical)
dependent samples
# Load the acceptability judgements (BEFORE/AFTER per sentence) and
# preview the first rows.
accjudg_path = DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-1-2-3_accjudg.csv"
data = pd.read_table(accjudg_path)
data.head()
SENTENCE | BEFORE | AFTER | |
---|---|---|---|
0 | 1 | acceptable | acceptable |
1 | 2 | acceptable | acceptable |
2 | 3 | acceptable | acceptable |
3 | 4 | acceptable | acceptable |
4 | 5 | acceptable | acceptable |
# McNemar's chi-square test on the paired BEFORE/AFTER judgements.
# Use the maintained implementation in statsmodels.stats.contingency_tables
# rather than the experimental sandbox module.
from statsmodels.stats.contingency_tables import mcnemar

# Square table of BEFORE (rows) against AFTER (columns) judgements.
crosstab = pd.crosstab(data['BEFORE'], data['AFTER'])

# exact=False requests the chi-square version of the test. With the default
# exact=True the binomial test is used instead, `correction` is ignored and
# the returned statistic is not a chi-square value.
result = mcnemar(crosstab, exact=False, correction=False)
x2, p = result.statistic, result.pvalue
print('Chi-square=%1.2f, p = %1.2f'%(x2, p))
Chi-square=13.00, p = 0.00
Independent t-test#
# Load the F1 frequency measurements (HZ_F1 per speaker SEX) and preview
# the first rows.
vowels_path = DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-3-2-1_f1-freq.csv"
vowels = pd.read_table(vowels_path)
vowels.head()
CASE | HZ_F1 | SEX | |
---|---|---|---|
0 | 1 | 488.57 | M |
1 | 2 | 557.66 | M |
2 | 3 | 424.69 | M |
3 | 4 | 625.95 | M |
4 | 5 | 531.04 | M |
# Independent two-sample t-test on F1 frequency, male vs. female speakers
# (ttest_ind is called with its default settings).
f1_male = vowels[vowels['SEX'] == 'M']['HZ_F1']
f1_female = vowels[vowels['SEX'] == 'F']['HZ_F1']
t, p = stats.ttest_ind(f1_male, f1_female)
print("t-score=%1.2f, p=%1.2f" % (t, p))
t-score=-2.44, p=0.02
One-way ANOVA#
# Load the reaction-time data set and display it.
reaction_times_path = DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/05-2_reactiontimes.csv"
data = pd.read_table(reaction_times_path)
data
CASE | RT | FREQUENCY | FAMILIARITY | IMAGEABILITY | MEANINGFULNESS | |
---|---|---|---|---|---|---|
0 | almond | 650.9947 | 0.693147 | NaN | NaN | NaN |
1 | ant | 589.4347 | 1.945910 | med | hi | 415.0 |
2 | apple | 523.0493 | 2.302585 | hi | hi | 451.0 |
3 | apricot | 642.3342 | 0.693147 | lo | lo | NaN |
4 | asparagus | 696.2092 | 0.693147 | med | lo | 442.0 |
... | ... | ... | ... | ... | ... | ... |
72 | tortoise | 733.0323 | 1.386294 | lo | lo | 403.0 |
73 | walnut | 663.5908 | 2.484907 | med | lo | 468.0 |
74 | wasp | 725.7056 | 1.098612 | NaN | NaN | NaN |
75 | whale | 609.9745 | 0.000000 | med | hi | 474.0 |
76 | woodpecker | 686.3439 | 0.693147 | NaN | NaN | NaN |
77 rows × 6 columns
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Keep only complete rows before fitting.
# NOTE(review): dropna() removes a row if ANY column is missing, including
# columns the model does not use (e.g. a row with FAMILIARITY present but
# MEANINGFULNESS NaN). Confirm this listwise deletion is intended.
data = data.dropna()

# One-way ANOVA: fit RT as a function of the FAMILIARITY factor and print
# the resulting ANOVA table.
model = ols(formula='RT ~ FAMILIARITY', data=data).fit()
aov = anova_lm(model)
print(aov)
df sum_sq mean_sq F PR(>F)
FAMILIARITY 2.0 16377.601187 8188.800594 4.645348 0.014654
Residual 45.0 79325.813100 1762.795847 NaN NaN