Mon 3/15

pandas-intro.py

#! /bin/python
#  Mar 21 (PJW)
#
#  Demonstration file introducing Pandas

import pandas as pd
import json

#
#  Starting point: an ordinary dictionary giving the number of days in
#  each month (February is entered as 28, so this is a non-leap year)
#

d = {
    'January': 31,
    'February': 28,
    'March': 31,
    'April': 30,
    'May': 31,
    'June': 30,
    'July': 31,
    'August': 31,
    'September': 30,
    'October': 31,
    'November': 30,
    'December': 31,
}

#  json.dumps() gives a readable, indented view of the dictionary

print( '\nDictionary:\n' )
print( json.dumps(d, indent=4) )

#%%
#
#  Convert the dictionary into a Pandas Series: the keys become the
#  index and the values become the data
#

s = pd.Series( data=d )

print( '\nSeries:\n' )
print( s )

#  The index can be shown as a Pandas object or as an ordinary list

print( s.index )
print( s.index.tolist() )

#%%
#
#  Look up a single element by its label, then several at once by
#  passing a list of labels
#

print( '\nMarch:', s.loc['March'] )

this_quarter = ['March', 'April', 'May']
print( s.loc[this_quarter] )

#%%
#
#  Series objects have built-in methods for common calculations
#

total_days = s.sum()
print( '\nTotal days:', total_days )

mean_days = s.mean()
print( '\nMean days per month:', mean_days )

#%%
#
#  describe() computes a whole set of summary statistics in one call
#

stats = s.describe()
print( stats )

#  The result is itself a Series, so individual statistics can be
#  pulled out by label. Here: the interquartile range.

p25 = stats.loc['25%']
p75 = stats.loc['75%']

print( '\nIQR:', p75 - p25 )

#%%
#
#  Arithmetic on a Series is applied to every element and produces a
#  new Series: here, each month's share of the year in percent
#

pct = s.mul(100).div( s.sum() )

print( '\nEach month as a percent of the year:\n' )
print( pct.round(2) )

#%%

#  Sort the percentages by the index (month names, alphabetically) ...

by_name = pct.sort_index( ascending=True )
print( '\nSorted by name:\n' )
print( by_name )

#  ... and by the values, smallest first

by_days = pct.sort_values( ascending=True )
print( '\nSorted by days:\n' )
print( by_days )

#%%
#
#  Draw a few quick plots. Matplotlib is covered in more detail later; for
#  now the key point is to call plt.figure() before each plot so that each
#  one starts a new figure. Otherwise, they will be drawn on top of one
#  another.
#

import matplotlib.pyplot as plt

#  Horizontal bar chart of days per month

plt.figure()
ax = s.plot( kind='barh' )

#  Histogram of the number of days

plt.figure()
ax = s.plot( kind='hist' )

#  Pie chart; this time the figure object is kept as well

fig = plt.figure()
ax = s.plot( kind='pie' )

#%%
#
#  Now assemble a DataFrame from several series. Pandas joins the series
#  automatically by aligning their indexes: s is in calendar order and
#  by_days is sorted by value, yet each percent lands on the right month
#

print( s )
print( by_days )

f = pd.DataFrame()
f['days'] = s
f['percent'] = by_days

print( f )

#
#  Row labels (the index) and column names of the result
#

print( f.index.tolist() )
print( f.columns.tolist() )

#
#  A plain list (not a series) has no index, so its values are assigned
#  to the rows purely by element order
#

month_numbers = list( range(1, 13) )
print( month_numbers )

f['number'] = month_numbers

print( f )

#%%
#
#  Next, load some CSV data: monthly average electricity use for the
#  household in the previous assignment.
#

kw = pd.read_csv( 'use_kw.csv' )

print( '\nkW data:\n' )
print( kw )

#
#  Make the month column ('mo') the index
#

kw = kw.set_index( keys='mo' )

print( kw )

#
#  Sort by the index and plot lines. A DataFrame plot opens a figure of
#  its own unless it is handed an axes, so pass in the axes of the figure
#  just created; otherwise that figure would be left blank.
#

kw = kw.sort_index()

print( kw )

lines_to_plot = ['p95','mean','median','p05']

plt.figure()
ax = kw[lines_to_plot].plot.line( ax=plt.gca() )

#%%

#  Add bands two standard deviations above and below the mean

two_sd = 2*kw['std']

kw['m+2s'] = kw['mean'] + two_sd
kw['m-2s'] = kw['mean'] - two_sd

lines_to_plot = ['m+2s','p95','mean','p05','m-2s']

#  A DataFrame plot opens a figure of its own unless it is handed an axes,
#  so pass in the axes of the new figure; otherwise that figure would be
#  left blank.

plt.figure()
ax = kw[lines_to_plot].plot.line( ax=plt.gca() )


Site Index | Zoom | Admin
URL: https://wilcoxen.maxwell.insightworks.com/pages/6201.html
Peter J Wilcoxen, The Maxwell School, Syracuse University
Revised 02/28/2022