Data Visualization I#

Preparing Datasets#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib


# Assemble a 10-row frame: an index-like `x` plus ten noisy series (y1..y10).
# y1 is pure standard-normal noise; every other series adds that noise to a
# run of ten consecutive integers beginning at the value listed here.
trend_starts = {'y2': 1, 'y3': 11, 'y4': 6, 'y5': 4, 'y6': 2,
                'y7': 5, 'y8': 4, 'y9': 4, 'y10': 2}
columns = {'x': range(1, 11), 'y1': np.random.randn(10)}
for name, start in trend_starts.items():
    columns[name] = np.random.randn(10) + range(start, start + 10)
# y5 additionally dips over its last three points.
columns['y5'] = columns['y5'] + (0, 0, 0, 0, 0, 0, 0, -3, -8, -6)
df = pd.DataFrame(columns)

# Store x as a categorical column rather than as plain integers.
df['x'] = df['x'].astype('category')
print(df.dtypes)
df.head(10)
x      category
y1      float64
y2      float64
y3      float64
y4      float64
y5      float64
y6      float64
y7      float64
y8      float64
y9      float64
y10     float64
dtype: object
x y1 y2 y3 y4 y5 y6 y7 y8 y9 y10
0 1 0.953949 2.878138 8.866186 7.169483 4.894806 1.383657 5.207700 2.749603 3.777047 4.338875
1 2 0.148130 2.182593 11.399598 5.671996 5.013517 2.503236 5.270997 3.332785 4.707431 3.470963
2 3 0.401601 3.342006 12.787722 7.898548 6.005761 5.079653 7.948136 7.321007 5.849269 3.589841
3 4 1.325920 4.496024 15.672599 9.664751 6.800054 3.797002 8.461314 7.364939 7.597704 4.230760
4 5 0.045468 4.553081 14.898799 10.109224 6.771789 6.317487 9.446912 9.109398 7.054921 5.494813
5 6 0.385796 6.015589 15.090889 10.442972 9.406430 5.064227 10.006732 6.563899 12.320422 7.353816
6 7 0.316242 5.671259 17.911464 11.287174 9.375809 7.951598 9.014010 11.649927 11.138327 7.948010
7 8 1.639019 7.694177 19.323432 12.289224 7.487505 8.626360 12.696997 11.324753 9.214215 8.570674
8 9 -0.886327 10.187601 18.373330 14.499094 4.679189 10.825828 13.044199 10.468501 11.931833 9.631274
9 10 -0.261461 8.573481 19.369957 13.711514 7.270583 11.427754 13.084936 13.267057 13.893086 11.912090

Matplotlib#

Resolution#

  • We can increase the dpi in the matplotlib parameters to get higher-resolution images in the notebook

  • The dpi setting has to go after the `%matplotlib inline` magic command, because the magic command resets the dpi to its default

## Change DPI for higher resolution in notebook
%matplotlib inline

# Resolution (dots per inch) for figures rendered in the notebook
matplotlib.rcParams['figure.dpi'] = 150
# Resolution for figures written to files with savefig
matplotlib.rcParams['savefig.dpi'] = 150
# Alternative way to change the DPI used when saving graphs to files:
# matplotlib.rc("savefig", dpi=dpi)

Matplotlib Style#

# available style
print(plt.style.available)
['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
# choose one style
plt.style.use('fivethirtyeight')

Matplotlib Chinese Issues#

## Setting Chinese Fonts
## Set once through rcParams instead of passing a font to each text element
plt.rcParams['font.sans-serif']=["PingFang HK"]
# NOTE(review): presumably turned off because the chosen CJK font may lack the Unicode minus glyph — confirm
plt.rcParams['axes.unicode_minus']= False

Plotting#

## Simple X and Y: a single line of y1 against x
x_values, y_values = df['x'], df['y1']
plt.plot(x_values, y_values)
plt.show()
../_images/a1b695521e7fb062bd34142549cf763c1d0de879bf8d4dacd981011010a69b5a.png
## One X against two Y series, drawn on the same axes
for series_name in ('y1', 'y2'):
    plt.plot(df['x'], df[series_name])
plt.show()
../_images/ca5d4536bb2f2ec166b4ad589b3cb0928d87e9ee63b577e2f967f5987fbfa263.png
## Adding legends: give each line a label, then request the legend
for series_name, legend_text in (('y1', "資料1"), ('y2', "資料2")):
    plt.plot(df['x'], df[series_name], label=legend_text)
plt.legend()
plt.tight_layout()
plt.show()
../_images/77d8093cb08b982fcbf6455a7763061de79e828cd9b24dbf5bd2345241d719b5.png
## Save graphs: same two-line plot, also written to plot.png
for series_name, legend_text in (('y1', "資料1"), ('y2', "資料2")):
    plt.plot(df['x'], df[series_name], label=legend_text)
plt.legend()
plt.tight_layout()
# The file is written before the figure is shown
plt.savefig('plot.png')
plt.show()
../_images/77d8093cb08b982fcbf6455a7763061de79e828cd9b24dbf5bd2345241d719b5.png
## Add x/y labels and title to the two-line plot
for series_name, legend_text in (('y1', "資料1"), ('y2', "資料2")):
    plt.plot(df['x'], df[series_name], label=legend_text)
plt.legend()

# Axis captions and the figure title
plt.xlabel("X軸")
plt.ylabel("Y軸")
plt.title("漂亮的圖")

plt.tight_layout()
plt.show()
../_images/30f727790eb7e5b540f249a0cc04edbcb92ea76d292af0103d0334128e577a2d.png

Bar Plots#

## Normal bar plot: one bar per x value, bar height taken from y3
plt.bar(x=df['x'], height=df['y3'])
plt.show()
../_images/e3b01bf5f8f58996ab4250091eb2c3ed861484629ada02b1db7614cb9ae723b4.png
## Sort bars according to values

# Order the rows by y3, using y2 to break ties (ascending is the default)
sort_keys = ['y3', 'y2']
df_sorted = df.sort_values(by=sort_keys)
print(df_sorted.dtypes)
df_sorted
x      category
y1      float64
y2      float64
y3      float64
y4      float64
y5      float64
y6      float64
y7      float64
y8      float64
y9      float64
y10     float64
dtype: object
x y1 y2 y3 y4 y5 y6 y7 y8 y9 y10
0 1 0.953949 2.878138 8.866186 7.169483 4.894806 1.383657 5.207700 2.749603 3.777047 4.338875
1 2 0.148130 2.182593 11.399598 5.671996 5.013517 2.503236 5.270997 3.332785 4.707431 3.470963
2 3 0.401601 3.342006 12.787722 7.898548 6.005761 5.079653 7.948136 7.321007 5.849269 3.589841
4 5 0.045468 4.553081 14.898799 10.109224 6.771789 6.317487 9.446912 9.109398 7.054921 5.494813
5 6 0.385796 6.015589 15.090889 10.442972 9.406430 5.064227 10.006732 6.563899 12.320422 7.353816
3 4 1.325920 4.496024 15.672599 9.664751 6.800054 3.797002 8.461314 7.364939 7.597704 4.230760
6 7 0.316242 5.671259 17.911464 11.287174 9.375809 7.951598 9.014010 11.649927 11.138327 7.948010
8 9 -0.886327 10.187601 18.373330 14.499094 4.679189 10.825828 13.044199 10.468501 11.931833 9.631274
7 8 1.639019 7.694177 19.323432 12.289224 7.487505 8.626360 12.696997 11.324753 9.214215 8.570674
9 10 -0.261461 8.573481 19.369957 13.711514 7.270583 11.427754 13.084936 13.267057 13.893086 11.912090
# Plot the bars in the sorted row order. x holds integer categories, which
# matplotlib would place at their numeric positions (undoing the sort), so
# cast them to strings to get one categorical tick per row, in row order.
plt.bar(df_sorted['x'].astype(str), df_sorted['y3'])
plt.show()
../_images/e3b01bf5f8f58996ab4250091eb2c3ed861484629ada02b1db7614cb9ae723b4.png
## Horizontal Bars
# barh draws horizontal bars (plt.bar always draws vertical ones). The x
# values are cast to strings so the bars keep the order of the sorted rows
# instead of being placed at their numeric positions.
df_by_y4 = df.sort_values('y4')
plt.barh(df_by_y4['x'].astype(str), df_by_y4['y4'])
plt.tight_layout()
plt.show()
../_images/bbb157de64ef0e399c4fbb0de1837ae24bafecf5b1347f9e4bc72197e31af63d.png

Pie Chart#

plt.style.use("fivethirtyeight")

# Grading components and their weights (the weights sum to 100)
grading_policy = {'Attendance': 20, 'Midterm': 30, 'Final': 30, 'Assignments': 20}
labels = list(grading_policy)
slices = list(grading_policy.values())
# Pull the 'Final' wedge slightly away from the centre
explode = [0.1 if component == 'Final' else 0 for component in labels]

# Appearance: drop shadow, first wedge starting at 90 degrees, one-decimal
# percentage printed on each wedge, black wedge outlines
pie_style = dict(shadow=True, startangle=90, autopct='%1.1f%%',
                 wedgeprops={'edgecolor': 'black'})
plt.pie(slices, labels=labels, explode=explode, **pie_style)

plt.title("Grading Policy")
plt.tight_layout()
plt.show()
../_images/04e0cc7164437ec3cdfb8159dd623a96c429534ecc224774971dbcb2d59e3898.png

Stacked Plot#

plt.style.use("fivethirtyeight")

# Minutes 1..9 on the x-axis; one value per minute for each player
minutes = list(range(1, 10))

player1 = [1, 2, 3, 3, 4, 4, 4, 4, 5]
player2 = [1, 1, 1, 1, 2, 2, 2, 3, 4]
player3 = [1, 1, 1, 2, 2, 2, 3, 3, 3]
series = (player1, player2, player3)

# One legend entry and one fill colour per player, in the same order
labels = ['player%d' % i for i in range(1, len(series) + 1)]
colors = ['#6d904f', '#fc4f30', '#008fd5']

# Each series is stacked on top of the previous ones
plt.stackplot(minutes, *series, labels=labels, colors=colors)
plt.legend(loc='upper left')

plt.title("Stacked Plot")
plt.tight_layout()
plt.show()
../_images/bc9c99e62c7dd30a8105421f5406ca1c01170083c4f4e99dd2343f61da8657cb.png

Histogram#

import random
import numpy as np

# Simulated grades: 10,000 draws from a normal distribution (mean 85, sd 13).
# Alternative left for reference:
# grades = [random.randint(0,100) for i in range(1000)]
grades = np.random.normal(loc=85, scale=13, size=10000)

# 50 bins with black bar outlines
# (explicit edges left for reference: bins = [50, 60, 70, 80, 90, 100])
plt.hist(grades, bins=50, edgecolor='black')

# Vertical reference line at the sample mean
mean_grade = np.mean(grades)
plt.axvline(mean_grade, color='#fc4f30', label='Mean Score', linewidth=2)
plt.legend()

plt.title('Histogram')
plt.xlabel('Student Grades')
plt.ylabel('個數')

plt.tight_layout()
plt.show()
../_images/8e683fd0da6152de8313f7e8cd3ab1ddb7790f7db1e23430fe87a316345b0cfe.png

Scatter Plot#

plt.figure(figsize=(7,5), dpi=300)
# Position encodes y4 (x-axis) and y5 (y-axis); marker colour and marker
# size both encode y6.
plt.scatter(df['y4'], df['y5'], c=df['y6'], s=df['y6']*100,cmap='summer',
            edgecolor='black', linewidth=1, alpha=0.75)

# Colour bar explaining the y6 colour scale
cbar = plt.colorbar()
cbar.set_label('Y6數值大小')

# plt.xscale('log')
# plt.yscale('log')

plt.title('3D Scatterplot')
# Axis labels name the columns that are actually plotted
plt.xlabel('Y4')
plt.ylabel('Y5')

plt.ylim((0,13))

plt.show()
../_images/397ccd635fe7897841a11e7dd4b156f89978373ca0f416af934c6fc79a2935fc.png

Complex Graphs#

df
x y1 y2 y3 y4 y5 y6 y7 y8 y9 y10
0 1 0.953949 2.878138 8.866186 7.169483 4.894806 1.383657 5.207700 2.749603 3.777047 4.338875
1 2 0.148130 2.182593 11.399598 5.671996 5.013517 2.503236 5.270997 3.332785 4.707431 3.470963
2 3 0.401601 3.342006 12.787722 7.898548 6.005761 5.079653 7.948136 7.321007 5.849269 3.589841
3 4 1.325920 4.496024 15.672599 9.664751 6.800054 3.797002 8.461314 7.364939 7.597704 4.230760
4 5 0.045468 4.553081 14.898799 10.109224 6.771789 6.317487 9.446912 9.109398 7.054921 5.494813
5 6 0.385796 6.015589 15.090889 10.442972 9.406430 5.064227 10.006732 6.563899 12.320422 7.353816
6 7 0.316242 5.671259 17.911464 11.287174 9.375809 7.951598 9.014010 11.649927 11.138327 7.948010
7 8 1.639019 7.694177 19.323432 12.289224 7.487505 8.626360 12.696997 11.324753 9.214215 8.570674
8 9 -0.886327 10.187601 18.373330 14.499094 4.679189 10.825828 13.044199 10.468501 11.931833 9.631274
9 10 -0.261461 8.573481 19.369957 13.711514 7.270583 11.427754 13.084936 13.267057 13.893086 11.912090
# create a color palette
palette = plt.get_cmap('Set1')

# multiple line plot: one line per series y1..y9 (x and y10 are left out)
#plt.figure(figsize=(5,3), dpi=150)
# Colour indices start at 0. 'Set1' holds nine colours (indices 0-8), so a
# 1-based index runs past the end and the last two lines get the same colour.
for color_index, column in enumerate(df.drop(['x','y10'], axis=1)):
    plt.plot(df['x'], df[column], marker='', color=palette(color_index), linewidth=1, alpha=0.9, label=column)

# Legend in the upper-left corner (loc=2), laid out in two columns
plt.legend(loc=2, ncol=2)

# Add titles
plt.title("Line Plot With Several Values", loc='left', fontsize=12, fontweight=0, color='orange')
plt.xlabel("Time")
plt.ylabel("Score")
Text(0, 0.5, 'Score')
../_images/842fa994e967f0ce03e1bcbe5833446b50ebf597df42be446c8d42f7489ab369.png
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Every series except x and y1 gets its own panel (y2..y10, nine panels)
df_columns = df.drop(columns=['x', 'y1']).columns
palette = plt.get_cmap('tab20')

# 3x3 grid of panels that share both axes
fig, ax = plt.subplots(nrows=3, ncols=3, sharex=True, sharey=True,
                       figsize=(8, 6), dpi=100)

# ax.flat walks the grid row by row, so panels are filled in column order
for num, (panel, series_name) in enumerate(zip(ax.flat, df_columns)):
    shade = palette(num)
    panel.plot(df['x'], df[series_name], color=shade, linewidth=1.9, alpha=0.9, label=series_name)
    panel.set_title(series_name, loc='left', fontsize=14, color=shade)

# Figure-level title plus axis captions shared by all panels
fig.suptitle("Facet Grids", fontsize=14, fontweight=0, color='black', style='italic', y=1.02)
fig.text(0.5, 0.0, 'Common X', ha='center', fontsize=14)
fig.text(0.0, 0.5, 'Common Y', va='center', rotation='vertical', fontsize=14)
Text(0.0, 0.5, 'Common Y')
../_images/18e18be816e0ac1adfdd780f585ec29e1a664b93d083318f7ca2e15d69e3cb23.png

Seaborn Module#

Two Types of Functions#

  • Figure-level functions (Generic)

  • Axes-level functions (Specific)

## Change the DPI

import seaborn as sns
# Raise the resolution of both displayed and saved figures
sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set_context('notebook')
sns.set_style("ticks")
# NOTE(review): this second sns.set() asks for style='darkgrid', which presumably replaces the "ticks" style chosen on the previous line — confirm which one is intended
sns.set(style='darkgrid')
penguins = sns.load_dataset('penguins')
penguins.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
# histogram
print(sns.__version__) # seaborn>=0.11.0
sns.displot(data=penguins, x="flipper_length_mm", hue="species", multiple="stack")
0.11.0
<seaborn.axisgrid.FacetGrid at 0x7fb050c132e8>
../_images/7fe6368edd235d01d4d889ffebc24978abc6de71531a58058df55b61dad16114.png
sns.displot(data=penguins, x="flipper_length_mm", hue="species", multiple="stack")
<seaborn.axisgrid.FacetGrid at 0x7fb040877860>
../_images/7fe6368edd235d01d4d889ffebc24978abc6de71531a58058df55b61dad16114.png
sns.displot(data=penguins, x="flipper_length_mm", hue="species", col="species")
<seaborn.axisgrid.FacetGrid at 0x7fb050bf5eb8>
../_images/bd815c1a1e8231df501915afeb139969f9c1a5555ae8d4b23e5a90124c8b9e1f.png
## kernel density plot
sns.kdeplot(data=penguins, x='flipper_length_mm', hue='species', multiple="stack")
<AxesSubplot:xlabel='flipper_length_mm', ylabel='Density'>
../_images/78acbbb82d4cb2f9f41c8e9bd7dee9f86072d8064e7f42b4c5abc3586441e9cf.png
sns.displot(data=penguins, x="flipper_length_mm", hue="species", multiple="stack", kind="kde")
<seaborn.axisgrid.FacetGrid at 0x7fb050a358d0>
../_images/3c1f477bf1acb2fb624222989adf6d843ffda9594ad42b953870804b26270d04.png
# Load seaborn's "tips" example dataset
tips = sns.load_dataset("tips")

# Figure-level plot of tip against total bill
g = sns.relplot(data=tips, x="total_bill", y="tip")
# Add a dashed reference line through (10, 2) with slope 0.2 on the underlying matplotlib axes
g.ax.axline(xy1=(10,2), slope=.2, color="b", dashes=(5,2))
<matplotlib.lines._AxLine at 0x7fb032fe64e0>
../_images/0aa657f5fcbf3f3ff0b84110c5b4bdcf2f8cb5e51239aa1cde04517b00a56995.png
g = sns.relplot(data=penguins, x="flipper_length_mm", y="bill_length_mm", col="sex")
g.set_axis_labels("Flipper length (mm)", "Bill length (mm)")
<seaborn.axisgrid.FacetGrid at 0x7fb0209e5c50>
../_images/e2be8e9441510f8e6e306f003461dcf73295ca7114e0fc2c8b0a3358f40f80d5.png
sns.catplot(data=penguins, x='species', y='flipper_length_mm', kind="box")
<seaborn.axisgrid.FacetGrid at 0x7fb032f99198>
../_images/eca71d3f96131909ede249cd786fe301c18eff15dabe33fe8936db0c8feb172f.png
  • jointplot(): plots the relationship or joint distribution of two variables while adding marginal axes that show the univariate distribution of each one separately

sns.jointplot(data=penguins, x="flipper_length_mm", y="bill_length_mm", hue="species")
<seaborn.axisgrid.JointGrid at 0x7fb020a01710>
../_images/3c4159b607a998fc5b732a00dde223226641ab2bed61ebe733361ecb71cacdee.png
  • pairplot(): visualizes every pairwise combination of variables simultaneously in a data frame

sns.pairplot(data=penguins, hue="species")
<seaborn.axisgrid.PairGrid at 0x7fb0408cb2b0>
../_images/22a4f37b79db0b5216b0e7b8c19a01f9aa987deb751b74caf687d2adb2685f85.png

Long-format vs. Wide-format Data#

flights = sns.load_dataset("flights")
flights.head()
year month passengers
0 1949 Jan 112
1 1949 Feb 118
2 1949 Mar 132
3 1949 Apr 129
4 1949 May 121
sns.relplot(data=flights, x="year", y="passengers", hue="month", kind="line")
<seaborn.axisgrid.FacetGrid at 0x7fb032dd7630>
../_images/35fd55907b2dcfaf3fa60be6cda75ea7e953517b5b608b9b82d4932c340cf661.png
sns.relplot(data=flights, x="month", y="passengers", hue="year", kind="line")
<seaborn.axisgrid.FacetGrid at 0x7fb040fe3438>
../_images/a84c0bbc3fd8f470569ba924609e98081586842d230e23cddd9199638995ba58.png
flights_wide = flights.pivot(index="year", columns="month", values="passengers")
flights_wide.head()
month Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
year
1949 112 118 132 129 121 135 148 148 136 119 104 118
1950 115 126 141 135 125 149 170 170 158 133 114 140
1951 145 150 178 163 172 178 199 199 184 162 146 166
1952 171 180 193 181 183 218 230 242 209 191 172 194
1953 196 196 236 235 229 243 264 272 237 211 180 201
print(type(flights_wide))
<class 'pandas.core.frame.DataFrame'>
sns.catplot(data=flights_wide, kind="box")
<seaborn.axisgrid.FacetGrid at 0x7fb040fc6be0>
../_images/0638d646ed3e01ce94d73d02894fad1b07404d0c49cbc155c0fd71ced733ab55.png

Chinese Fonts Issues#

  • Find system-compatible Chinese fonts using the terminal command:

!fc-list :lang=zh
  • Define the font to be used as well as the font properties in Python:

from matplotlib import rcParams
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
# rcParams['axes.unicode_minus']=False
# Load the font directly from a font file; the path below is machine-specific
myfont = FontProperties(fname='/Library/Fonts/Songti.ttc',
 size=15)
# Pass the font explicitly to each text element that contains Chinese
plt.title('圖表標題', fontproperties=myfont)
plt.ylabel('Y軸標題', fontproperties=myfont)
plt.legend(('分類一', '分類二', '分類三'), loc='best', prop=myfont)
  • For a permanent solution, please read references.

    • Modify the setting file in matplotlib: matplotlib.matplotlib_fname() to get the file path

    • It’s similar to: /Users/YOUR_NAME/opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/matplotlibrc

    • Two important parameters: font.family and font.serif

    • Add the font name under font.serif. My case: Source Han Sans

## One can set the font preference permanently
## in the setting file
import matplotlib
matplotlib.matplotlib_fname()
'/Users/Alvin/opt/anaconda3/envs/python-notes/lib/python3.7/site-packages/matplotlib/mpl-data/matplotlibrc'
from matplotlib import rcParams
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
# rcParams['axes.unicode_minus']=False
#/Users/alvinchen/Library/Fonts/SourceHanSans.ttc
#'/System/Library/Fonts/PingFang.ttc'
def getChineseFont(size=15, fname='/Users/Alvin/Library/Fonts/SourceHanSans.ttc'):
    """Return a FontProperties object for a Chinese-capable font file.

    Args:
        size: Font size passed to FontProperties.
        fname: Path to the font file. The default is the author's local
            Source Han Sans file; pass another path on machines where that
            file does not exist.

    Returns:
        A matplotlib.font_manager.FontProperties built from ``fname`` and
        ``size``.
    """
    return FontProperties(fname=fname, size=size)

print(getChineseFont().get_name())
plt.title('圖表標題', fontproperties=getChineseFont(20))
plt.ylabel('Y軸標題', fontproperties=getChineseFont(12))
plt.legend(('分類一', '分類二', '分類三'), loc='best', prop=getChineseFont())
Source Han Sans
<matplotlib.legend.Legend at 0x7fb033c97588>
../_images/83d152ae74a37e8f33b3d0c5e78d225e33f103eeb593c208ea2762078819a786.png
## Configure the font once through rcParams instead of per text element
plt.rcParams.update({
    'font.sans-serif': ["PingFang HK"],
    'axes.unicode_minus': False,
})

# Small demo line whose title and axis labels are in Chinese
x_points = (2, 4, 6)
y_points = (3, 5, 7)
plt.plot(x_points, y_points)
plt.title("中文標題")
plt.ylabel("y軸標題")
plt.xlabel("x軸標題")
plt.show()
../_images/1f1efbda8e0f9d2e390a0ad5057d83f93f06f0c794f976792f3a59e2929f6e2d.png
## Seaborn
# `font` takes a font family name; 'san-serif' was a misspelling of the
# generic 'sans-serif' family (and was wrapped in a list).
sns.set(font='sans-serif')
# Make a Chinese-capable font the first choice within the sans-serif family
sns.set_style("whitegrid",{"font.sans-serif":["PingFang HK"]})
# (word, count) pairs used as demo data
cities_counter = [('好棒', 285), ('給我', 225), ('不要', 163), ('細柔', 136), ('吃飯', 130), ('小小', 124), ('深圳', 88), ('溫州', 67), ('小知', 66), ('大之', 45)]
sns.set_color_codes("pastel")
# Split the first ten pairs into bar labels and bar heights
words, counts = zip(*cities_counter[:10])
sns.barplot(x=list(words), y=list(counts))
<AxesSubplot:>
../_images/78354d9ac5405338f7dbe5c4f5d856616fc3b3e36347eae25ca3c9903e9f7b11.png

References#

Requirements#

seaborn==0.11.0
pandas==1.1.2
numpy==1.18.1
matplotlib==3.3.2