{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Analytic Statistics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Include common analytic statistical analyses\n",
    "- Still, R is better for these tasks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy.stats as stats\n",
    "\n",
    "DEMO_DATA_ROOT = \"../../../RepositoryData/data\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Kruskal Test\n",
    "\n",
    "- Two independent sample means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CASE</th>\n",
       "      <th>HEDGES</th>\n",
       "      <th>SEX</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>17</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>17</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>17</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>16</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   CASE  HEDGES SEX\n",
       "0     1      17   F\n",
       "1     2      17   F\n",
       "2     3      17   F\n",
       "3     4      17   F\n",
       "4     5      16   F"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hedges = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-1-2-1_hedges.csv\")\n",
    "hedges.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.4666666666666667 \n",
      " 0.0025300622362698397\n"
     ]
    }
   ],
   "source": [
    "u_statistic, p = stats.ks_2samp(hedges[hedges['SEX']==\"M\"]['HEDGES'],hedges[hedges['SEX']==\"F\"]['HEDGES'] )\n",
    "print(u_statistic, '\\n', p)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chi-square"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 85,  65],\n",
       "       [100, 147]])"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = np.array([[85, 65],\n",
    "                 [100,147]])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Chi-square value = 9.82, df = 1.00, p = 0.00\n"
     ]
    }
   ],
   "source": [
    "V, p, df, expected = stats.chi2_contingency(data, correction=False)\n",
    "print(\"Chi-square value = %1.2f, df = %1.2f, p = %1.2f\"%(V, df, p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## McNear Test\n",
    "\n",
    "- One dependent variable (categorical)\n",
    "- dependent samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SENTENCE</th>\n",
       "      <th>BEFORE</th>\n",
       "      <th>AFTER</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>acceptable</td>\n",
       "      <td>acceptable</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>acceptable</td>\n",
       "      <td>acceptable</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>acceptable</td>\n",
       "      <td>acceptable</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>acceptable</td>\n",
       "      <td>acceptable</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>acceptable</td>\n",
       "      <td>acceptable</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   SENTENCE      BEFORE       AFTER\n",
       "0         1  acceptable  acceptable\n",
       "1         2  acceptable  acceptable\n",
       "2         3  acceptable  acceptable\n",
       "3         4  acceptable  acceptable\n",
       "4         5  acceptable  acceptable"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-1-2-3_accjudg.csv\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Chi-square=13.00, p = 0.00\n"
     ]
    }
   ],
   "source": [
    "from statsmodels.sandbox.stats.runs import mcnemar\n",
    "\n",
    "crosstab = pd.crosstab(data['BEFORE'],data['AFTER'])\n",
    "x2, p = mcnemar(crosstab, correction=False)\n",
    "print('Chi-square=%1.2f, p = %1.2f'%(x2, p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Independent *t*-test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CASE</th>\n",
       "      <th>HZ_F1</th>\n",
       "      <th>SEX</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>488.57</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>557.66</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>424.69</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>625.95</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>531.04</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   CASE   HZ_F1 SEX\n",
       "0     1  488.57   M\n",
       "1     2  557.66   M\n",
       "2     3  424.69   M\n",
       "3     4  625.95   M\n",
       "4     5  531.04   M"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vowels = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/04-3-2-1_f1-freq.csv\")\n",
    "vowels.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "t-score=-2.44, p=0.02\n"
     ]
    }
   ],
   "source": [
    "t, p = stats.ttest_ind(vowels[vowels['SEX']=='M']['HZ_F1'], vowels[vowels['SEX']=='F']['HZ_F1'])\n",
    "print(\"t-score=%1.2f, p=%1.2f\"%(t,p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## One-way ANOVA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CASE</th>\n",
       "      <th>RT</th>\n",
       "      <th>FREQUENCY</th>\n",
       "      <th>FAMILIARITY</th>\n",
       "      <th>IMAGEABILITY</th>\n",
       "      <th>MEANINGFULNESS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>almond</td>\n",
       "      <td>650.9947</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ant</td>\n",
       "      <td>589.4347</td>\n",
       "      <td>1.945910</td>\n",
       "      <td>med</td>\n",
       "      <td>hi</td>\n",
       "      <td>415.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>apple</td>\n",
       "      <td>523.0493</td>\n",
       "      <td>2.302585</td>\n",
       "      <td>hi</td>\n",
       "      <td>hi</td>\n",
       "      <td>451.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>apricot</td>\n",
       "      <td>642.3342</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>lo</td>\n",
       "      <td>lo</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>asparagus</td>\n",
       "      <td>696.2092</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>med</td>\n",
       "      <td>lo</td>\n",
       "      <td>442.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>tortoise</td>\n",
       "      <td>733.0323</td>\n",
       "      <td>1.386294</td>\n",
       "      <td>lo</td>\n",
       "      <td>lo</td>\n",
       "      <td>403.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>walnut</td>\n",
       "      <td>663.5908</td>\n",
       "      <td>2.484907</td>\n",
       "      <td>med</td>\n",
       "      <td>lo</td>\n",
       "      <td>468.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74</th>\n",
       "      <td>wasp</td>\n",
       "      <td>725.7056</td>\n",
       "      <td>1.098612</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75</th>\n",
       "      <td>whale</td>\n",
       "      <td>609.9745</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>med</td>\n",
       "      <td>hi</td>\n",
       "      <td>474.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>woodpecker</td>\n",
       "      <td>686.3439</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>77 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          CASE        RT  FREQUENCY FAMILIARITY IMAGEABILITY  MEANINGFULNESS\n",
       "0       almond  650.9947   0.693147         NaN          NaN             NaN\n",
       "1          ant  589.4347   1.945910         med           hi           415.0\n",
       "2        apple  523.0493   2.302585          hi           hi           451.0\n",
       "3      apricot  642.3342   0.693147          lo           lo             NaN\n",
       "4    asparagus  696.2092   0.693147         med           lo           442.0\n",
       "..         ...       ...        ...         ...          ...             ...\n",
       "72    tortoise  733.0323   1.386294          lo           lo           403.0\n",
       "73      walnut  663.5908   2.484907         med           lo           468.0\n",
       "74        wasp  725.7056   1.098612         NaN          NaN             NaN\n",
       "75       whale  609.9745   0.000000         med           hi           474.0\n",
       "76  woodpecker  686.3439   0.693147         NaN          NaN             NaN\n",
       "\n",
       "[77 rows x 6 columns]"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_table(DEMO_DATA_ROOT + \"/gries_sflwr/_inputfiles/05-2_reactiontimes.csv\")\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.dropna()\n",
    "\n",
    "from statsmodels.formula.api import ols\n",
    "from statsmodels.stats.anova import anova_lm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               df        sum_sq      mean_sq         F    PR(>F)\n",
      "FAMILIARITY   2.0  16377.601187  8188.800594  4.645348  0.014654\n",
      "Residual     45.0  79325.813100  1762.795847       NaN       NaN\n"
     ]
    }
   ],
   "source": [
    "model = ols('RT ~ FAMILIARITY', data).fit()\n",
    "aov = anova_lm(model)\n",
    "print(aov)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python-notes",
   "language": "python",
   "name": "python-notes"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}