{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analytic Statistics"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Include common analytic statistical analyses\n",
"- Still, R is better for these tasks."
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import scipy.stats as stats\n",
"\n",
"DEMO_DATA_ROOT = \"../../../RepositoryData/data\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Kolmogorov-Smirnov Test\n",
"\n",
"- Two independent samples (compares their distributions)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" CASE | \n",
" HEDGES | \n",
" SEX | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 17 | \n",
" F | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 17 | \n",
" F | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 17 | \n",
" F | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 17 | \n",
" F | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 16 | \n",
" F | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" CASE HEDGES SEX\n",
"0 1 17 F\n",
"1 2 17 F\n",
"2 3 17 F\n",
"3 4 17 F\n",
"4 5 16 F"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hedges = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-1-2-1_hedges.csv\")\n",
"hedges.head()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.4666666666666667 \n",
" 0.0025300622362698397\n"
]
}
],
"source": [
"# Two-sample Kolmogorov-Smirnov test on HEDGES for the SEX == 'M' and SEX == 'F' rows.\n",
"# stats.ks_2samp returns the KS statistic D (not a U statistic) and the p-value.\n",
"hedges_m = hedges.loc[hedges['SEX'] == \"M\", 'HEDGES']\n",
"hedges_f = hedges.loc[hedges['SEX'] == \"F\", 'HEDGES']\n",
"ks_statistic, p = stats.ks_2samp(hedges_m, hedges_f)\n",
"print(ks_statistic, '\\n', p)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Chi-square"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 85, 65],\n",
" [100, 147]])"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = np.array([[85, 65],\n",
" [100,147]])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chi-square value = 9.82, df = 1.00, p = 0.00\n"
]
}
],
"source": [
"V, p, df, expected = stats.chi2_contingency(data, correction=False)\n",
"print(\"Chi-square value = %1.2f, df = %1.2f, p = %1.2f\"%(V, df, p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## McNemar Test\n",
"\n",
"- One dependent variable (categorical)\n",
"- dependent samples"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SENTENCE | \n",
" BEFORE | \n",
" AFTER | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" acceptable | \n",
" acceptable | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" acceptable | \n",
" acceptable | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" acceptable | \n",
" acceptable | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" acceptable | \n",
" acceptable | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" acceptable | \n",
" acceptable | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SENTENCE BEFORE AFTER\n",
"0 1 acceptable acceptable\n",
"1 2 acceptable acceptable\n",
"2 3 acceptable acceptable\n",
"3 4 acceptable acceptable\n",
"4 5 acceptable acceptable"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-1-2-3_accjudg.csv\")\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chi-square=13.00, p = 0.00\n"
]
}
],
"source": [
"from statsmodels.stats.contingency_tables import mcnemar\n",
"\n",
"# Cross-tabulate the paired BEFORE/AFTER values into a square table.\n",
"crosstab = pd.crosstab(data['BEFORE'],data['AFTER'])\n",
"# exact=False requests the chi-square version of McNemar's test. With the default\n",
"# exact=True a binomial test is run instead: the returned statistic is the smaller\n",
"# off-diagonal count (not a chi-square value) and `correction` has no effect.\n",
"result = mcnemar(crosstab, exact=False, correction=False)\n",
"x2, p = result.statistic, result.pvalue\n",
"print('Chi-square=%1.2f, p = %1.2f'%(x2, p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Independent *t*-test"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" CASE | \n",
" HZ_F1 | \n",
" SEX | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 488.57 | \n",
" M | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 557.66 | \n",
" M | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 424.69 | \n",
" M | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 625.95 | \n",
" M | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 531.04 | \n",
" M | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" CASE HZ_F1 SEX\n",
"0 1 488.57 M\n",
"1 2 557.66 M\n",
"2 3 424.69 M\n",
"3 4 625.95 M\n",
"4 5 531.04 M"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vowels = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-3-2-1_f1-freq.csv\")\n",
"vowels.head()"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"t-score=-2.44, p=0.02\n"
]
}
],
"source": [
"t, p = stats.ttest_ind(vowels[vowels['SEX']=='M']['HZ_F1'], vowels[vowels['SEX']=='F']['HZ_F1'])\n",
"print(\"t-score=%1.2f, p=%1.2f\"%(t,p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## One-way ANOVA"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" CASE | \n",
" RT | \n",
" FREQUENCY | \n",
" FAMILIARITY | \n",
" IMAGEABILITY | \n",
" MEANINGFULNESS | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" almond | \n",
" 650.9947 | \n",
" 0.693147 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" ant | \n",
" 589.4347 | \n",
" 1.945910 | \n",
" med | \n",
" hi | \n",
" 415.0 | \n",
"
\n",
" \n",
" 2 | \n",
" apple | \n",
" 523.0493 | \n",
" 2.302585 | \n",
" hi | \n",
" hi | \n",
" 451.0 | \n",
"
\n",
" \n",
" 3 | \n",
" apricot | \n",
" 642.3342 | \n",
" 0.693147 | \n",
" lo | \n",
" lo | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" asparagus | \n",
" 696.2092 | \n",
" 0.693147 | \n",
" med | \n",
" lo | \n",
" 442.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 72 | \n",
" tortoise | \n",
" 733.0323 | \n",
" 1.386294 | \n",
" lo | \n",
" lo | \n",
" 403.0 | \n",
"
\n",
" \n",
" 73 | \n",
" walnut | \n",
" 663.5908 | \n",
" 2.484907 | \n",
" med | \n",
" lo | \n",
" 468.0 | \n",
"
\n",
" \n",
" 74 | \n",
" wasp | \n",
" 725.7056 | \n",
" 1.098612 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 75 | \n",
" whale | \n",
" 609.9745 | \n",
" 0.000000 | \n",
" med | \n",
" hi | \n",
" 474.0 | \n",
"
\n",
" \n",
" 76 | \n",
" woodpecker | \n",
" 686.3439 | \n",
" 0.693147 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
77 rows × 6 columns
\n",
"
"
],
"text/plain": [
" CASE RT FREQUENCY FAMILIARITY IMAGEABILITY MEANINGFULNESS\n",
"0 almond 650.9947 0.693147 NaN NaN NaN\n",
"1 ant 589.4347 1.945910 med hi 415.0\n",
"2 apple 523.0493 2.302585 hi hi 451.0\n",
"3 apricot 642.3342 0.693147 lo lo NaN\n",
"4 asparagus 696.2092 0.693147 med lo 442.0\n",
".. ... ... ... ... ... ...\n",
"72 tortoise 733.0323 1.386294 lo lo 403.0\n",
"73 walnut 663.5908 2.484907 med lo 468.0\n",
"74 wasp 725.7056 1.098612 NaN NaN NaN\n",
"75 whale 609.9745 0.000000 med hi 474.0\n",
"76 woodpecker 686.3439 0.693147 NaN NaN NaN\n",
"\n",
"[77 rows x 6 columns]"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/05-2_reactiontimes.csv\")\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"data = data.dropna()\n",
"\n",
"from statsmodels.formula.api import ols\n",
"from statsmodels.stats.anova import anova_lm"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" df sum_sq mean_sq F PR(>F)\n",
"FAMILIARITY 2.0 16377.601187 8188.800594 4.645348 0.014654\n",
"Residual 45.0 79325.813100 1762.795847 NaN NaN\n"
]
}
],
"source": [
"model = ols('RT ~ FAMILIARITY', data).fit()\n",
"aov = anova_lm(model)\n",
"print(aov)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "python-notes",
"language": "python",
"name": "python-notes"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": false,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}