The Maxwell School
Syracuse University
Syracuse University
"""
dissim.py
Spring 2022 PJW

Calculate dissimilarity indexes for large US counties.

Inputs:  'append.csv' (block-group records keyed by state, county, bg)
         'county_names.csv' (joined on state and county)
Outputs: 'dissim.csv' (large-county results sorted by dissimilarity index)
         'pop_by_bin.png' (bar chart of population share by index bin)
"""

import pandas as pd
import matplotlib.pyplot as plt

# Key columns that identify a block group, and the leading pair that
# identifies the county containing it.
bg_keys = ['state', 'county', 'bg']
co_keys = bg_keys[:2]

#
#  Load the block-group records. The geographic codes are read as
#  strings and then become the index.
#

fips = {key: str for key in bg_keys}

dat = pd.read_csv('append.csv', dtype=fips).set_index(bg_keys)

#
#  County totals for every data column. The column list is saved in
#  'races' so the same columns can be picked out again later.
#

by_co = dat.groupby(co_keys)
by_co_tot = by_co.sum()

races = by_co_tot.columns

#
#  Each block group's share of its county total. Summing the shares
#  back up by county is a sanity check on the division.
#

shr = dat/by_co_tot

check = shr.groupby(co_keys).sum()

print('\nChecking shares in 20 random counties:\n')
print(check.sample(20))

#
#  Dissimilarity index: 100 * 0.5 * the county sum of the absolute
#  gap between the white and nonwhite block-group shares.
#

abs_diff = (shr['white'] - shr['nonwhite']).abs()
abs_diff_by_co = abs_diff.groupby(co_keys).sum()

dissim = 100*0.5*abs_diff_by_co

#%%
#
#  County-level results: the totals plus the number of block groups
#  and the rounded index.
#

all_co_results = by_co_tot.assign(
    num_bg=by_co.size(),
    dissim=dissim.round(2),
)

#%%
#
#  Total population by race across all counties, in millions
#

tot_pop = all_co_results[races].sum()/1e6

print('\nTotal population:\n')
print(tot_pop)

#%%
#
#  Keep only the large counties: at least 50 block groups and at
#  least 10,000 nonwhite residents.
#

large_co_results = all_co_results.query("num_bg >= 50 and nonwhite >= 10000")

#
#  Population of the large counties, in millions, and the percentage
#  of the overall total that it represents.
#

large_pop = large_co_results[races].sum()/1e6
large_pct = 100*large_pop/tot_pop

print('\nPopulation and population share for large counties:\n')
print(large_pop)
print(large_pct)

#%%
#
#  Attach county names. The merge indicator is printed so unmatched
#  counties are visible, and is then discarded.
#

names = pd.read_csv('county_names.csv', dtype=str)

res = large_co_results.merge(
    names,
    on=co_keys,
    how='left',
    validate='1:1',
    indicator=True,
)

print('\nChecking merge of county names:\n')
print(res['_merge'].value_counts())

#
#  Drop the indicator, order from lowest to highest index, and save
#

res = res.drop(columns='_merge').sort_values('dissim')

res.to_csv('dissim.csv', index=False)

#
#  Show the rows for state code '36'. This is done before the 'bin'
#  column is added below, so the printout does not include it.
#

nys = res.query("state=='36'")

print('\nNY counties from lowest dissimilarity to highest:\n')
print(nys)

#%%
#
#  Bin the counties by rounding the index to the tens place, then
#  total each bin's population (millions) and express it as a percent
#  of the large-county population.
#

res['bin'] = res['dissim'].round(-1)

by_bin = res.groupby('bin')

pop_by_bin = by_bin[races].sum()/1e6
pct_by_bin = 100*pop_by_bin/large_pop

print('\nPopulation by bin:\n')
print(pop_by_bin)

print('\nPercent of population by bin:\n')
print(pct_by_bin)

#%%
#
#  Bar chart of the white and nonwhite population percentages in each
#  dissimilarity bin
#

bars = ['white', 'nonwhite']

fig1, ax1 = plt.subplots(dpi=300)
pct_by_bin[bars].plot.bar(ax=ax1)

fig1.suptitle("Degree of Segregation in Large US Counties")
ax1.set_xlabel('Dissimilarity Index')
ax1.set_ylabel('Percent of Overall Population')

fig1.tight_layout()
fig1.savefig('pop_by_bin.png')