Computational Exercises: Solutions > Analyzing the Contributions Data (g20)

contrib_clean.py

"""
contrib_clean.py
Spring 2021 PJW

Clean up the 2020 presidential contributions data
"""

import pandas as pd

#
#  Set up file names
#

contrib_file = 'contrib_by_zip.zip'   # input: contributions aggregated by zip (zipped CSV)
po_file = 'pocodes.csv'               # input: master list of state postal codes
pkl_file = 'contrib_clean.pkl'        # output: cleaned contribution records (pickle)
com_total_file = 'com_total.csv'      # output: contribution totals by committee

#
#  Read the aggregated contributions.  Everything is read as a
#  string (so zip codes keep their leading zeros), and then the
#  amount column is converted to a float so it can be summed.
#

contrib = (
    pd.read_csv(contrib_file, dtype=str)
      .assign(amt=lambda df: df['amt'].astype(float))
    )

#
#  Read the postal code data, which will be used to select states.
#  The 'Name' column isn't needed and is dropped immediately.
#

po = pd.read_csv(po_file).drop(columns='Name')

#
#  Merge the postal data onto the contributions.  An outer merge
#  with an indicator column is used so that records on either side
#  without a match can be identified afterwards.  The m:1 check
#  verifies that each postal code appears only once in po.
#

contrib = pd.merge(
    contrib, po,
    left_on='STATE', right_on='PO',
    how='outer', indicator=True, validate='m:1')

print( contrib['_merge'].value_counts() )

#
#  Flag records whose state codes aren't in the master postal code
#  file (islands, military post offices, etc).  A record is bad
#  unless the merge matched it on both sides.
#

state_bad = ~(contrib['_merge'] == 'both')

#
#  The merge bookkeeping columns are no longer needed
#

contrib = contrib.drop(columns=['PO', '_merge'])

#
#  Count up the records that have bad state codes and report how
#  many there are and how much money is involved.
#

state_bad_amt = contrib[state_bad].groupby('STATE')['amt'].sum()

print( '\nContributions from state codes being dropped:\n' )
print( state_bad_amt )
print( state_bad_amt.sum() )

#
#  Keep only the good records.  Use ~ (elementwise not) to invert
#  the boolean mask rather than comparing it to False (PEP 8 E712).
#

contrib = contrib[ ~state_bad ]

#
#  Now check for bad zip codes by finding any that aren't purely
#  numeric: coerce each zip to a number and look for NaNs, which
#  mark the values that failed to parse.  Then print out some
#  counts and keep only the records with good zips.
#

num_zip = pd.to_numeric( contrib['zip'], errors='coerce' )
zip_bad = num_zip.isna()

zip_bad_amt = contrib[zip_bad].groupby('zip')['amt'].sum()

print( '\nContributions from zip codes being dropped:\n' )
print( zip_bad_amt )
print( zip_bad_amt.sum() )

#  Invert the mask with ~ rather than comparing it to False (PEP 8 E712)
contrib = contrib[ ~zip_bad ]

#
#  Write the results out as a pickle file.  A pickle keeps the
#  columns' dtypes intact (presumably so a later script can read
#  the cleaned data back without re-parsing — confirm downstream).
#

contrib.to_pickle(pkl_file)

#
#  Aggregate contributions by committee (i.e., over all states and
#  zips) and save that as a CSV file.  The series is renamed so the
#  CSV's value column is labeled 'total_amt'.
#

com_total = contrib.groupby('CMTE_ID')['amt'].sum().rename('total_amt')
com_total.to_csv(com_total_file)
Site Index | Zoom | Admin
URL: https://wilcoxen.maxwell.insightworks.com/pages/5282.html
Peter J Wilcoxen, The Maxwell School, Syracuse University
Revised 04/26/2022