{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analytic Statistics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Include common analytic statistical analyses\n", "- Still, R is better for these tasks." ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import scipy.stats as stats\n", "\n", "DEMO_DATA_ROOT = \"../../../RepositoryData/data\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Kruskal Test\n", "\n", "- Two independent sample means" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CASEHEDGESSEX
0117F
1217F
2317F
3417F
4516F
\n", "
" ], "text/plain": [ " CASE HEDGES SEX\n", "0 1 17 F\n", "1 2 17 F\n", "2 3 17 F\n", "3 4 17 F\n", "4 5 16 F" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hedges = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-1-2-1_hedges.csv\")\n", "hedges.head()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.4666666666666667 \n", " 0.0025300622362698397\n" ] } ], "source": [ "u_statistic, p = stats.ks_2samp(hedges[hedges['SEX']==\"M\"]['HEDGES'],hedges[hedges['SEX']==\"F\"]['HEDGES'] )\n", "print(u_statistic, '\\n', p)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chi-square" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 85, 65],\n", " [100, 147]])" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = np.array([[85, 65],\n", " [100,147]])\n", "data" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Chi-square value = 9.82, df = 1.00, p = 0.00\n" ] } ], "source": [ "V, p, df, expected = stats.chi2_contingency(data, correction=False)\n", "print(\"Chi-square value = %1.2f, df = %1.2f, p = %1.2f\"%(V, df, p))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## McNear Test\n", "\n", "- One dependent variable (categorical)\n", "- dependent samples" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SENTENCEBEFOREAFTER
01acceptableacceptable
12acceptableacceptable
23acceptableacceptable
34acceptableacceptable
45acceptableacceptable
\n", "
" ], "text/plain": [ " SENTENCE BEFORE AFTER\n", "0 1 acceptable acceptable\n", "1 2 acceptable acceptable\n", "2 3 acceptable acceptable\n", "3 4 acceptable acceptable\n", "4 5 acceptable acceptable" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-1-2-3_accjudg.csv\")\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Chi-square=13.00, p = 0.00\n" ] } ], "source": [ "from statsmodels.sandbox.stats.runs import mcnemar\n", "\n", "crosstab = pd.crosstab(data['BEFORE'],data['AFTER'])\n", "x2, p = mcnemar(crosstab, correction=False)\n", "print('Chi-square=%1.2f, p = %1.2f'%(x2, p))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Independent *t*-test" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CASEHZ_F1SEX
01488.57M
12557.66M
23424.69M
34625.95M
45531.04M
\n", "
" ], "text/plain": [ " CASE HZ_F1 SEX\n", "0 1 488.57 M\n", "1 2 557.66 M\n", "2 3 424.69 M\n", "3 4 625.95 M\n", "4 5 531.04 M" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vowels = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-3-2-1_f1-freq.csv\")\n", "vowels.head()" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "t-score=-2.44, p=0.02\n" ] } ], "source": [ "t, p = stats.ttest_ind(vowels[vowels['SEX']=='M']['HZ_F1'], vowels[vowels['SEX']=='F']['HZ_F1'])\n", "print(\"t-score=%1.2f, p=%1.2f\"%(t,p))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## One-way ANOVA" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CASERTFREQUENCYFAMILIARITYIMAGEABILITYMEANINGFULNESS
0almond650.99470.693147NaNNaNNaN
1ant589.43471.945910medhi415.0
2apple523.04932.302585hihi451.0
3apricot642.33420.693147loloNaN
4asparagus696.20920.693147medlo442.0
.....................
72tortoise733.03231.386294lolo403.0
73walnut663.59082.484907medlo468.0
74wasp725.70561.098612NaNNaNNaN
75whale609.97450.000000medhi474.0
76woodpecker686.34390.693147NaNNaNNaN
\n", "

77 rows × 6 columns

\n", "
" ], "text/plain": [ " CASE RT FREQUENCY FAMILIARITY IMAGEABILITY MEANINGFULNESS\n", "0 almond 650.9947 0.693147 NaN NaN NaN\n", "1 ant 589.4347 1.945910 med hi 415.0\n", "2 apple 523.0493 2.302585 hi hi 451.0\n", "3 apricot 642.3342 0.693147 lo lo NaN\n", "4 asparagus 696.2092 0.693147 med lo 442.0\n", ".. ... ... ... ... ... ...\n", "72 tortoise 733.0323 1.386294 lo lo 403.0\n", "73 walnut 663.5908 2.484907 med lo 468.0\n", "74 wasp 725.7056 1.098612 NaN NaN NaN\n", "75 whale 609.9745 0.000000 med hi 474.0\n", "76 woodpecker 686.3439 0.693147 NaN NaN NaN\n", "\n", "[77 rows x 6 columns]" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/05-2_reactiontimes.csv\")\n", "data" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "data = data.dropna()\n", "\n", "from statsmodels.formula.api import ols\n", "from statsmodels.stats.anova import anova_lm" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " df sum_sq mean_sq F PR(>F)\n", "FAMILIARITY 2.0 16377.601187 8188.800594 4.645348 0.014654\n", "Residual 45.0 79325.813100 1762.795847 NaN NaN\n" ] } ], "source": [ "model = ols('RT ~ FAMILIARITY', data).fit()\n", "aov = anova_lm(model)\n", "print(aov)" ] } ], "metadata": { "kernelspec": { "display_name": "python-notes", "language": "python", "name": "python-notes" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }