# The Maxwell School
# Syracuse University
""" figures.py Spring 2022 PJW Plot a range of figures using Seaborn. """ import pandas as pd import seaborn as sns import matplotlib.pyplot as plt # # Set the default resolution and Seaborn style # plt.rcParams['figure.dpi'] = 300 sns.set_theme(style='white') # # Read the dataset # pv = pd.read_pickle('ca_csi_2020_pkl.zip') # # Describe it briefly # print('\nDataframe information:\n') pv.info() #%% # # Examine the categories # catvars = ['app_status','sector','state','inst_status','type'] for var in catvars: print( f'\n{var}:\n') print( pv[var].value_counts() ) fig = sns.catplot(y=var,data=pv,kind='count') #%% # # Focus on a subset of the data # res = pv.query("sector == 'Residential'") res = res.query("app_status == 'Completed'") res = res.query("inst_status == 'Installed'") print('\nOriginal records:',len(pv)) print('Trimmed records:',len(res)) res['year'] = res['year'].astype(int) #%% # # Check interesting variables in trimmed dataset; save the figures # while we're at it. # for var in ['third_party','year']: fig = sns.catplot(y=var,data=res,kind='count') fig.savefig(f'res_{var}.png') #%% n_last = len(res) res = res.dropna( subset=['nameplate','total_cost'] ) n_now = len(res) print( 'records dropped due to missing data', n_last-n_now) #%% # # Trim the data set down to more typical residential # installations. 
# fig, (ax1,ax2) = plt.subplots(1,2) res['nameplate'].plot.hist(ax=ax1) ax1.set_title('Nameplate') res['total_cost'].plot.hist(ax=ax2) ax2.set_title('Cost') fig.tight_layout() #%% # # Trim off projects with sizes or costs above the 99th quantile # kw99 = res['nameplate'].quantile(0.99) tc99 = res['total_cost'].quantile(0.99) print( '\n99th quantiles of nameplate and total_cost:\n' ) print( kw99, tc99 ) trim = res.query(f"nameplate <= {kw99} and total_cost <= {tc99}") print( "records after trimming large projects", len(trim) ) fig, (ax1,ax2) = plt.subplots(1,2) trim['nameplate'].plot.hist(ax=ax1) ax1.set_title('Nameplate') trim['total_cost'].plot.hist(ax=ax2) ax2.set_title('Cost') fig.tight_layout() fig.savefig('nameplate_cost.png') #%% # # Draw the new histograms and save them along the way. # for var in ['nameplate','total_cost']: fig, ax1 = plt.subplots() sns.histplot(data=trim,x=var,hue='third_party',kde=True,ax=ax1) fig.tight_layout() fig.savefig(f'res_{var}.png') #%% # # Other ways to look at the distribution # fig, ax1 = plt.subplots() sns.boxenplot(data=trim,x='third_party',y='nameplate',ax=ax1) ax1.set_title("Nameplate Capacity") ax1.set_xlabel("Third Party") ax1.set_ylabel("kW") fig.tight_layout() fig.savefig('res_boxen_all.png') fig, ax1 = plt.subplots() sns.violinplot(data=trim,x='nameplate',y='inst_status',hue='third_party',split=True,ax=ax1) ax1.set_title("Nameplate Capacity") ax1.set_xlabel("kW") ax1.set_ylabel("") fig.tight_layout() fig.savefig('res_violin.png') # # Overlaid kernel density plots # fig, ax1 = plt.subplots() sns.kdeplot(data=trim,x='nameplate', hue='third_party', palette='crest', fill=True, ax=ax1) ax1.set_title("Nameplate Capacity") ax1.set_xlabel("kW") fig.tight_layout() fig.savefig('res_kde.png') #%% # # More detail: nameplate by year # main = trim.query("year <= 2016") fig, ax1 = plt.subplots() sns.boxenplot(data=main,y='year',x='nameplate',orient='h',ax=ax1) ax1.set_title("Nameplate Capacity by Year") ax1.set_xlabel("kW") 
ax1.set_ylabel("Year") fig.tight_layout() fig.savefig('res_boxen_year.png') #%% # # Plot the joint distribution using a hex plot # jg = sns.jointplot(data=trim, x='nameplate', y='total_cost', kind='hex') jg.set_axis_labels("Nameplate","Total Cost") jg.fig.suptitle('Distribution of Systems by Cost and Capacity') jg.fig.tight_layout() jg.savefig('res_hexbin.png')