Working with Large Zip Files (g11)

pecan.py

"""
pecan.py
Spring 2022 PJW

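Aggregate and analyze high-resolution electricity data from the
Pecan Street Project: read usage records from a CSV file stored
inside a zip archive, write the average usage for each hour to
usage.csv, and print key percentiles of the hourly averages.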
"""

import csv
import io
import zipfile
import numpy as np
from collections import defaultdict 

#
#  Set up file names
#

zipname = 'id114_2014.zip'   # zip archive that holds the input data
csvname = 'id114_2014.csv'   # CSV member read from inside the archive
outname = 'usage.csv'        # output CSV of month, day, hour, usage rows

#
#  Create an object for managing the zip file and then open 
#  the CSV file inside it. Finally, set up a CSV reader for 
#  the file.
#

with zipfile.ZipFile(zipname) as zip_object, \
     zip_object.open(csvname) as inp_byte, \
     io.TextIOWrapper(inp_byte, newline='') as inp_handle:

    #  The with statement closes the text wrapper, the archive member
    #  and the archive itself even if a record fails to parse.
    #  newline='' hands line endings to the csv module untouched, as
    #  its documentation asks for any file object given to a reader.

    inp_reader = csv.DictReader(inp_handle)

    #
    #  Loop through the data in the input file collecting
    #  information about each hour. hourly maps a
    #  (month, day, hour) tuple to the list of usage readings
    #  that fall in that hour.
    #

    hourly = defaultdict(list)

    for rec in inp_reader:

        #  Skip any records with missing data. DictReader fills the
        #  field with None when a row is too short, so test for
        #  that as well as for the empty string.

        if not rec['use']:
            continue

        #  Get the timestamp and usage

        ts = rec['localminute']
        use = float( rec['use'] )

        #  Figure out the month, day, and hour of the data and
        #  use that to make a key for hourly. The timestamp must
        #  split into a date with three '/'-separated fields and a
        #  time with two ':'-separated fields; anything else raises
        #  ValueError here rather than being silently misread.

        (date,clock) = ts.split()
        (mo,dy,_yr) = date.split('/')
        (hr,_mi) = clock.split(':')

        hour = (int(mo),int(dy),int(hr))

        #  Add the data to its group

        hourly[ hour ].append( use )

#
#  Now go through the data and compute average usage for
#  each hour. Write it out with its month, day, and hour.
#
#  The output file is opened only after the input has been read
#  completely, so a failure while parsing does not leave an empty
#  output file behind, and the with statement closes it even if
#  writing fails part way through.
#

averages = []

with open(outname,'w',newline='') as out_handle:

    out_writer = csv.writer(out_handle)
    out_writer.writerow(['month','day','hour','usage'])

    for key in sorted(hourly):

        values = hourly[ key ]

        #  An hour with more than 60 readings is dropped instead of
        #  averaged -- presumably it holds duplicated timestamps
        #  (e.g. the repeated hour when daylight saving time ends);
        #  confirm against the data.

        if len(values) > 60:
            print(f'Dropping record for {key}, has >60 values\n')
            continue

        avg = round( np.mean(values), 3 )

        (m,d,h) = key
        out_writer.writerow([m,d,h,avg])

        #  Keep the hourly averages for the percentile calculations
        #  that follow.

        averages.append(avg)

#
#  Compute key percentiles of hourly average
#

#  Percentile levels to report and the matching values of the
#  hourly averages, in the same order.

pctiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]
pctvals = np.percentile(averages, pctiles)

#  One line per percentile: the level, then its value in kW.

report = [
    f"{level:2d} %: {value:4.3f} kW"
    for level, value in zip(pctiles, pctvals)
]

for line in report:
    print(line)

#%%
#
#  Not required. Calculates the answers to the questions in results.md 
#  but it's fine to calculate them with a calculator.
#

#  Look up percentile values by their level.

pct = dict(zip(pctiles, pctvals))

median = pct[50]
p05, p25, p75, p95, p99 = (pct[level] for level in (5, 25, 75, 95, 99))

#  Interquartile range, then each quantity of interest expressed
#  relative to the median and rounded to three places.

iqr = p75 - p25

rel_iqr, rel_25, rel_75, rel_99 = (
    round(amount / median, 3) for amount in (iqr, p25, p75, p99)
)

answers = [
    f'\nQ1: median = {median}',
    f'Q2: IQR = {iqr} or {p25} to {p75}',
    f'Q3: IQR/median = {rel_iqr} or {rel_25} to {rel_75}',
    f'Q4: 90% CI = [{p05},{p95}]',
    f'Q5: 99th/median: {rel_99}',
]

for line in answers:
    print(line)
Site Index | Zoom | Admin
URL: https://wilcoxen.maxwell.insightworks.com/pages/6210.html
Peter J Wilcoxen, The Maxwell School, Syracuse University
Revised 02/26/2022