The Maxwell School
Syracuse University
""" contrib_clean.py Spring 2021 PJW Clean up the 2020 presidential contributions data """ import pandas as pd # # Set up file names # contrib_file = 'contrib_by_zip.zip' po_file = 'pocodes.csv' pkl_file = 'contrib_clean.pkl' com_total_file = 'com_total.csv' # # Read the aggregated contributions as strings and then # convert the amount to a float. # contrib = pd.read_csv(contrib_file,dtype=str) contrib['amt'] = contrib['amt'].astype(float) # # Read the postal code data in order to select states # po = pd.read_csv(po_file) po = po.drop(columns='Name') # # Merge the postal data onto the contributions # contrib = contrib.merge( po,left_on='STATE',right_on='PO',how='outer', validate='m:1',indicator=True) print( contrib['_merge'].value_counts() ) # # Identify records that have state codes that aren't in the # master post code file (islands, military post offices, etc). # state_bad = contrib['_merge'] != 'both' # # Drop some columns we no longer need # contrib = contrib.drop(columns=['_merge','PO']) # # Count up the records that have bad state codes and report how # many there are. # bad_recs = contrib[state_bad].groupby('STATE') state_bad_amt = bad_recs['amt'].sum() print( '\nContributions from state codes being dropped:\n' ) print( state_bad_amt ) print( state_bad_amt.sum() ) # # Keep only the good records # contrib = contrib[ state_bad == False ] # # Now check for bad zip codes by finding any that aren't purely numeric. # Then print out some counts and keep only the records with good zips. # num_zip = pd.to_numeric( contrib['zip'], errors='coerce' ) zip_bad = num_zip.isna() bad_recs = contrib[zip_bad].groupby('zip') zip_bad_amt = bad_recs['amt'].sum() print( '\nContributions from zip codes being dropped:\n' ) print( zip_bad_amt ) print( zip_bad_amt.sum() ) contrib = contrib[ zip_bad == False ] # # Write the results out as a pickle file # contrib.to_pickle(pkl_file) # # Aggregate contributions by committee (i.e., over all states and # zips) and save that as a CSV file. # by_com = contrib.groupby('CMTE_ID') com_total = by_com['amt'].sum() com_total.name = 'total_amt' com_total.to_csv(com_total_file)