# Computing County Dissimilarity Indexes (g16)

# dissim.py

"""
dissim.py
Spring 2022 PJW

Calculate dissimilarity indexes for large US counties.
"""

import pandas as pd
import matplotlib.pyplot as plt

#
#  Read the demographic data about blockgroups
#
#  The state, county and block group identifiers are read as strings
#  rather than numbers (later code compares state against the string
#  '36'); presumably this also keeps any leading zeros intact.
#

fips = {'state':str,'county':str,'bg':str}

#  NOTE(review): the statement that loads dat was missing from this
#  copy of the script, so the set_index() call below failed with a
#  NameError. The read has been restored, but the input file name is
#  an assumption -- confirm it against the assignment's data files.

dat = pd.read_csv('append.csv',dtype=fips)

#  Index by the three identifiers so that only the population
#  columns remain as data columns

dat = dat.set_index(['state','county','bg'])

#
#  County totals: group the block groups on the state and county
#  levels of the index and add up the data columns within each group
#

by_co = dat.groupby(['state', 'county'])
by_co_tot = by_co.sum()

#  Column labels of the totals; later steps use them to select these
#  same columns once other columns have been added

races = by_co_tot.columns

#
#  Block group shares: divide each block group's values by the totals
#  for its county. The division lines the two frames up on the state
#  and county index levels they have in common.
#

shr = dat.div(by_co_tot)

#  Sanity check: within a county the shares should add up to 1

check = shr.groupby(['state', 'county']).sum()

print('\nChecking shares in 20 random counties:\n')
print(check.sample(20))

#
#  County dissimilarity index: half of the sum over block groups of
#  |white share - nonwhite share|, expressed as a percentage
#

abs_diff = (shr['white'] - shr['nonwhite']).abs()
dissim = 50.0 * abs_diff.groupby(['state', 'county']).sum()

#%%
#
#  Begin the table of county results: a copy of the population totals
#  extended with the number of block groups in each county and the
#  county's dissimilarity index rounded to two decimal places
#

all_co_results = by_co_tot.assign(
    num_bg=by_co.size(),
    dissim=dissim.round(2),
)

#%%
#
#  Total population over all counties for each of the columns in
#  races, reported in millions
#

tot_pop = all_co_results[races].sum().div(1e6)

print('\nTotal population:\n')
print(tot_pop)

#%%
#
#  Keep only the large counties: those with at least 50 block groups
#  and a nonwhite population of at least 10,000
#

large_co_results = all_co_results[
    (all_co_results['num_bg'] >= 50)
    & (all_co_results['nonwhite'] >= 10000)
]

#
#  Population of the large counties in millions, and that population
#  as a percentage of the overall total, for each column in races
#

large_pop = large_co_results[races].sum().div(1e6)

print('\nPopulation and population share for large counties:\n')
print(large_pop)
print(large_pop.mul(100).div(tot_pop))

#%%
#
#  Merge on the names of the counties
#
#  NOTE(review): the statement that loads names was missing from this
#  copy of the script, so the merge below failed with a NameError. The
#  read has been restored using the same string dtypes as the block
#  group data so the join keys match, but the input file name is an
#  assumption -- confirm it against the assignment's data files.
#

names = pd.read_csv('county_names.csv',dtype=fips)

#  Left join keeps every large county even if it has no name record;
#  validate='1:1' raises an error if a state-county pair is duplicated
#  on either side, and the indicator column records the match status.

res = large_co_results.merge(names,
                             on=['state','county'],
                             how='left',
                             validate='1:1',
                             indicator=True)

#  Report how many counties matched, then drop the indicator since it
#  is not wanted in the output

print( '\nChecking merge of county names:\n')
print( res['_merge'].value_counts() )
res = res.drop(columns='_merge')

#
#  Order the counties from the lowest dissimilarity index to the
#  highest and save the table, leaving out the row index
#

res = res.sort_values(by='dissim')
res.to_csv('dissim.csv', index=False)

#
#  Show the rows for New York State (state code 36)
#

nys = res.query("state=='36'")

print('\nNY counties from lowest dissimilarity to highest:\n')
print(nys)

#%%
#
#  Assign each county to a bin by rounding its dissimilarity index to
#  the nearest multiple of ten
#

res['bin'] = res['dissim'].round(-1)

#  Population in each bin, in millions, and each bin's percentage of
#  the large-county population, for every column in races

by_bin = res.groupby('bin')
pop_by_bin = by_bin[races].sum().div(1e6)
pct_by_bin = pop_by_bin.mul(100).div(large_pop)

print('\nPopulation by bin:\n')
print(pop_by_bin)

print('\nPercent of population by bin:\n')
print(pct_by_bin)

#%%
#
#  Bar chart of the percentage of the white and nonwhite populations
#  living in counties in each dissimilarity bin, saved as a PNG
#

fig1, ax1 = plt.subplots(dpi=300)

bars = ['white', 'nonwhite']
pct_by_bin[bars].plot(kind='bar', ax=ax1)

#  Labels are set after plotting so they replace any defaults that
#  the plotting call puts on the axes

fig1.suptitle("Degree of Segregation in Large US Counties")
ax1.set(xlabel='Dissimilarity Index',
        ylabel='Percent of Overall Population')
fig1.tight_layout()
fig1.savefig('pop_by_bin.png')


# URL: https://wilcoxen.maxwell.insightworks.com/pages/7443.html
# Peter J Wilcoxen, The Maxwell School, Syracuse University
# Revised 03/27/2022