# Computational Exercises: Solutions > Introduction to Seaborn (g18)
# figures.py

"""
figures.py
Spring 2022 PJW

Plot a range of figures using Seaborn.
"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#
#  Plot defaults: 300 dpi figures and Seaborn's plain white style
#

plt.rc('figure', dpi=300)
sns.set_theme(style='white')

#
#  Load the pickled dataframe from the zipped data file
#

pv = pd.read_pickle('ca_csi_2020_pkl.zip')

#
#  Print a short summary of the dataframe
#

print('\nDataframe information:\n')
pv.info()

#%%
#
#  For each variable in catvars, print how often each value occurs
#  and draw a horizontal count plot of the same information.
#

catvars = ['app_status','sector','state','inst_status','type']

for var in catvars:
    counts = pv[var].value_counts()
    print( f'\n{var}:\n')
    print( counts )
    fig = sns.catplot(data=pv, kind='count', y=var)

#%%

#
#  Focus on a subset of the data: residential projects whose
#  application is completed and whose system is installed.
#
#  The three filters are applied in a single query, and the result is
#  explicitly copied so that res is an independent dataframe. Without
#  the copy, the column assignment at the end of this cell would be
#  made on a selection from pv and pandas would issue a
#  SettingWithCopyWarning.
#

res = pv.query(
    "sector == 'Residential' and "
    "app_status == 'Completed' and "
    "inst_status == 'Installed'"
    ).copy()

print('\nOriginal records:',len(pv))
print('Trimmed records:',len(res))

#  Store the year as an integer

res['year'] = res['year'].astype(int)

#%%
#
#  Count plots of two variables of interest in the trimmed dataset.
#  Each plot is also written to a PNG file named after its variable.
#

for var in ('third_party','year'):
    outfile = f'res_{var}.png'
    fig = sns.catplot(data=res, kind='count', y=var)
    fig.savefig(outfile)

#%%
#
#  Drop records that are missing the nameplate value or the total
#  cost, and report how many were removed.
#

required = ['nameplate','total_cost']

n_last = len(res)
res = res.dropna(subset=required)
n_now = len(res)

n_dropped = n_last - n_now
print('records dropped due to missing data', n_dropped)

#%%
#
#  Look at the untrimmed distributions first: side-by-side histograms
#  of nameplate and total cost, to see whether the data set needs to
#  be trimmed down to more typical residential installations.
#

fig, (ax1,ax2) = plt.subplots(nrows=1, ncols=2)

panels = [
    (ax1, 'nameplate', 'Nameplate'),
    (ax2, 'total_cost', 'Cost'),
]

for ax, col, title in panels:
    res[col].plot.hist(ax=ax)
    ax.set_title(title)

fig.tight_layout()

#%%
#
#  Trim off projects with sizes or costs above the 99th quantile
#

kw99 = res['nameplate'].quantile(0.99)
tc99 = res['total_cost'].quantile(0.99)

print( '\n99th quantiles of nameplate and total_cost:\n' )
print( kw99, tc99 )

#
#  Refer to the cutoffs with @ instead of pasting their text into the
#  query string. The comparison then uses the float values themselves,
#  and a NaN cutoff (for example, from an empty res) is treated as a
#  value rather than being parsed as a variable name.
#

trim = res.query("nameplate <= @kw99 and total_cost <= @tc99")

print( "records after trimming large projects", len(trim) )

#
#  Redraw the two histograms using the trimmed data and save the figure
#

fig, (ax1,ax2) = plt.subplots(1,2)
trim['nameplate'].plot.hist(ax=ax1)
ax1.set_title('Nameplate')
trim['total_cost'].plot.hist(ax=ax2)
ax2.set_title('Cost')
fig.tight_layout()
fig.savefig('nameplate_cost.png')

#%%

#
#  Histograms of the trimmed data, one per variable, with the bars
#  split by third_party and a kernel density curve overlaid. Each
#  figure is saved to a PNG file named after its variable.
#

for var in ('nameplate','total_cost'):
    outfile = f'res_{var}.png'
    fig, ax1 = plt.subplots()
    sns.histplot(data=trim, x=var, hue='third_party', kde=True, ax=ax1)
    fig.tight_layout()
    fig.savefig(outfile)

#%%

#
#  Other ways to look at the distribution, starting with a boxen plot
#  of nameplate for each value of third_party
#

fig, ax1 = plt.subplots()
sns.boxenplot(ax=ax1, data=trim, x='third_party', y='nameplate')

ax1.set_xlabel("Third Party")
ax1.set_ylabel("kW")
ax1.set_title("Nameplate Capacity")

fig.tight_layout()
fig.savefig('res_boxen_all.png')

#
#  Split violin plot of nameplate with one half per third_party value.
#  inst_status is used as the categorical axis; the earlier filter kept
#  only 'Installed' records, so presumably this gives a single violin.
#  The y label is blanked out.
#

fig, ax1 = plt.subplots()
sns.violinplot(
    ax=ax1,
    data=trim,
    x='nameplate',
    y='inst_status',
    hue='third_party',
    split=True,
)

ax1.set_xlabel("kW")
ax1.set_ylabel("")
ax1.set_title("Nameplate Capacity")

fig.tight_layout()
fig.savefig('res_violin.png')

#
#  Filled kernel density curves of nameplate, one per third_party
#  value, overlaid on a single set of axes
#

fig, ax1 = plt.subplots()
sns.kdeplot(ax=ax1, data=trim, x='nameplate', hue='third_party',
            fill=True, palette='crest')

ax1.set_xlabel("kW")
ax1.set_title("Nameplate Capacity")

fig.tight_layout()
fig.savefig('res_kde.png')

#%%
#
#  More detail: horizontal boxen plots of nameplate, one per year,
#  restricted to records with year no later than 2016
#

main = trim.query("year <= 2016")

fig, ax1 = plt.subplots()
sns.boxenplot(ax=ax1, data=main, x='nameplate', y='year', orient='h')

ax1.set_xlabel("kW")
ax1.set_ylabel("Year")
ax1.set_title("Nameplate Capacity by Year")

fig.tight_layout()
fig.savefig('res_boxen_year.png')

#%%
#
#  Joint distribution of nameplate and total cost drawn with hexagonal
#  bins, then labelled, titled and saved
#

jg = sns.jointplot(kind='hex', data=trim, x='nameplate', y='total_cost')
jg.set_axis_labels(xlabel="Nameplate", ylabel="Total Cost")

hexfig = jg.fig
hexfig.suptitle('Distribution of Systems by Cost and Capacity')
hexfig.tight_layout()

jg.savefig('res_hexbin.png')
# Site Index | Zoom | Admin
# URL: https://wilcoxen.maxwell.insightworks.com/pages/7574.html
# Peter J Wilcoxen, The Maxwell School, Syracuse University
# Revised 04/26/2022