Descriptive Statistics

# Relative path to the demo data directory; used below as the prefix
# when building the path of the input file.
DEMO_DATA_ROOT = "../../../RepositoryData/data"
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Univariate Statistics

# Load the uh(m) data set into a DataFrame. The file has a .csv extension
# but is parsed with pd.read_table, whose default delimiter is a tab.
UHM = pd.read_table(DEMO_DATA_ROOT+"/gries_sflwr/_inputfiles/03-1_uh(m).csv")
# Display the DataFrame (columns: CASE, SEX, FILLER, GENRE, LENGTH).
UHM
CASE SEX FILLER GENRE LENGTH
0 1 male uhm monolog 1014
1 2 female silence monolog 1188
2 3 female silence dialog 889
3 4 female uhm dialog 265
4 5 male uh dialog 465
... ... ... ... ... ...
995 996 male silence dialog 751
996 997 female uhm dialog 1005
997 998 female uhm monolog 568
998 999 female uh dialog 984
999 1000 female uh dialog 1521

1000 rows × 5 columns

# Absolute frequency of each FILLER value (uh / silence / uhm),
# sorted from most to least frequent.
UHM.value_counts(UHM['FILLER'])
FILLER
uh         394
silence    332
uhm        274
dtype: int64
# Same table as relative frequencies: normalize=True divides each count
# by the total number of rows, so the values sum to 1.
UHM.value_counts(UHM['FILLER'], normalize=True)
FILLER
uh         0.394
silence    0.332
uhm        0.274
dtype: float64
def ecdf(data):
    """Compute the empirical cumulative distribution function of *data*.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of observations.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The observations sorted in ascending order, and for each sorted
        observation its cumulative proportion ``i / n`` (``i`` running
        from 1 to ``n``), so the last value is always 1.0.
    """
    sorted_values = np.sort(data)
    n_obs = len(sorted_values)
    # Rank of each sorted observation divided by the sample size.
    cumulative_share = np.arange(1, n_obs + 1) / n_obs
    return sorted_values, cumulative_share

# Apply ecdf to the three FILLER frequency counts (not to the 1000 raw
# observations): returns the counts sorted ascending and their cumulative
# proportions 1/3, 2/3, 1.
ecdf(UHM.value_counts(UHM['FILLER']))
(array([274, 332, 394]), array([0.33333333, 0.66666667, 1.        ]))