May 24, 2021
Katrina Sostek
Work done as part of a Google.org Fellowship with the Satcher Health Leadership Institute at the Morehouse School of Medicine
#@title
# How to update the data:
# 1. You may need to copy this colab so you have your own version.
# 2. Update the table name constants below to have the latest data's suffix.
# 3. Update the date variables below to be the last case date included in the data.
# 4. Updates the scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
# Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full updated of
# the paper including text and tables, a lot of that is done in commented out PrintSummaryStats() statements.
import pandas as pd
import altair as alt
from vega_datasets import data
from google.colab import auth
auth.authenticate_user()
# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
pd.options.display.float_format = '{:,.2f}'.format
# Project and table names.
PROJECT_ID = 'msm-secure-data-1b'
CDC_TABLE = '`%s.ndunlap_secure.cdc_restricted_access_20210430`' % PROJECT_ID
CRDT_TABLE = '`%s.ndunlap_secure.crdt_20210307`' % PROJECT_ID
NCHS_STATES_TABLE = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210414`'
NCHS_COUNTIES_TABLE = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_county_20210414`'
NCHS_STATES_CRDT_COMPARE_TABLE = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210303`'
# Dates in different formats.
DATE = 'DATE(2021, 04, 15)'
DATE_DISPLAY_NAME = 'Apr 15'
CRDT_DATE = '20210307'
CRDT_COMPARE_DATE = 'DATE(2021, 03, 07)'
CRDT_COMPARE_DATE_DISPLAY_NAME = 'Mar 7'
# NCHS deaths data dates.
NCHS_DATE = 'DATE(2021, 4, 14)'
NCHS_DATE_DISPLAY_NAME = 'Apr 14'
CRDT_NCHS_DATE = '20210303'
CRDT_COMPARE_NCHS_DATE = 'DATE(2021, 03, 03)'
CRDT_COMPARE_NCHS_DATE_DISPLAY_NAME = 'Mar 3'
# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
TOTAL_CASES_SCALE_MAX = 70000
COUNTY_CASES_SCALE_MAX = 24000
COUNTY_CASES_ZOOM_SCALE_MAX = 3000
CASES_RACE_SCALE_MAX = 60000 # known race/ethnicity
# Chart settings.
SCATTER_HEIGHT = 300
SCATTER_WIDTH = 300
MAP_HEIGHT = 300
MAP_WIDTH = 450
US_STATES_TOPO = alt.topo_feature(data.us_10m.url, 'states')
US_COUNTIES_TOPO = alt.topo_feature(data.us_10m.url+"#", 'counties')
TERRITORIES = ('PR', 'GU', 'VI', 'MP', 'AS')
NYT_TERRITORIES = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
STATES_TO_FIPS = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12, 'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56, 'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78, 'NYC': 36}
FIPS_TO_STATES = {STATES_TO_FIPS[key]: key for key in STATES_TO_FIPS}
STATES_TO_ABBREVIATIONS = {'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'}
RACE_ETHNICITY_COMBINED_MAP = {
'Asian, Non-Hispanic': 'asian_cases',
'Black, Non-Hispanic': 'black_cases',
'White, Non-Hispanic': 'white_cases',
'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
'Hispanic/Latino': 'hispanic_cases',
'Multiple/Other, Non-Hispanic': 'other_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
'Missing': 'unknown_cases',
'Unknown': 'unknown_cases',
'NA': 'na_cases',
}
RACE_ETHNICITY_COMBINED_KNOWN_MAP = {
'Asian, Non-Hispanic': 'cdc_known_cases',
'Black, Non-Hispanic': 'cdc_known_cases',
'White, Non-Hispanic': 'cdc_known_cases',
'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
'Hispanic/Latino': 'cdc_known_cases',
'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
'Missing': 'cdc_unknown_cases',
'Unknown': 'cdc_unknown_cases',
'NA': 'cdc_na_cases',
}
#@title
CDC_OVERALL_RACE_QUERY = ('''
SELECT
race_ethnicity_combined,
COUNT(*) as cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
1
''' % CDC_TABLE)
CRDT_QUERY_STR = ('''
SELECT
State as state,
Deaths_Total as crdt_cases,
Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM %s
WHERE
date = %s
''')
CRDT_QUERY = CRDT_QUERY_STR % (CRDT_TABLE, CRDT_DATE)
CRDT_NCHS_QUERY = CRDT_QUERY_STR % (CRDT_TABLE, CRDT_NCHS_DATE)
NYT_STATES_QUERY_STR = ('''
SELECT
state_name,
state_fips_code,
deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
date = %s AND
state_fips_code IS NOT NULL
''')
NYT_STATES_QUERY = NYT_STATES_QUERY_STR % DATE
NYT_STATES_COMPARE_CRDT_QUERY = NYT_STATES_QUERY_STR % CRDT_COMPARE_DATE
NYT_STATES_NCHS_QUERY = NYT_STATES_QUERY_STR % NCHS_DATE
CDC_STATES_QUERY = ('''
SELECT
res_state,
COUNT(*) as cdc_cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
res_state
''' % CDC_TABLE)
NYT_COUNTIES_QUERY_STR = ('''
SELECT
county_fips_code,
deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
date = %s AND
county_fips_code IS NOT NULL
''')
NYT_COUNTIES_QUERY = NYT_COUNTIES_QUERY_STR % DATE
NYT_COUNTIES_NCHS_QUERY = NYT_COUNTIES_QUERY_STR % NCHS_DATE
CDC_COUNTIES_RACE_QUERY = ('''
SELECT
res_state,
res_county,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
res_county,
res_state,
race_ethnicity_combined
''' % CDC_TABLE)
COUNTY_FIPS_MAPPING_QUERY = ('''
SELECT
*
FROM
`msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''')
ACS_POPULATION_DATA_QUERY = ('''
SELECT
state,
county,
county_fips,
total_pop
FROM
`msm-internal-data.ipums_acs.acs_2019_5year_county`
''')
CDC_STATES_RACE_QUERY = ('''
SELECT
res_state,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
res_state,
race_ethnicity_combined
''' % CDC_TABLE)
CDC_STATES_RACE_UP_TO_CRDT_QUERY = ('''
SELECT
res_state,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
WHERE
death_yn = 'Yes' AND
cdc_case_earliest_dt <= %s
GROUP BY
res_state,
race_ethnicity_combined
''' % (CDC_TABLE, CRDT_COMPARE_DATE))
NCHS_STATES_OVERALL_QUERY = ('''
SELECT
*
FROM
%s
WHERE
Indicator = "Distribution of COVID-19 deaths (%%)" AND
State = 'United States'
''' % NCHS_STATES_TABLE)
NCHS_STATE_COUNTS_OVERALL_QUERY = ('''
SELECT
*
FROM
%s
WHERE
Indicator = "Count of COVID-19 deaths" AND
State = 'United States'
''' % NCHS_STATES_TABLE)
# Group = "By Total" added for April 28 file.
# `Group` = "By Total" AND
NCHS_STATES_QUERY_STR = ('''
SELECT
*
FROM %s
WHERE
Indicator = "Distribution of COVID-19 deaths (%%)" AND
State != 'United States'
''')
# Group = "By Total" added for April 28 file.
# `Group` = "By Total" AND
NCHS_STATE_COUNTS_QUERY_STR = ('''
SELECT
*
FROM %s
WHERE
Indicator = "Count of COVID-19 deaths" AND
State != 'United States'
''')
NCHS_COUNTIES_QUERY = ('''
SELECT
*
FROM %s
WHERE
Indicator = "Distribution of COVID-19 deaths (%%)"
''' % NCHS_COUNTIES_TABLE)
#@title
def FieldAnalysis(project_id, table, field_list, title, calculate_race_ethnicity=False):
dict = {}
for field in field_list:
dict[field] = [0.0, 0.0, 0.0, 0.0]
unknowns = pd.DataFrame(dict, index=['Unknown', 'Missing', 'NA', 'Known'])
field_series = []
value_series = []
percent_series = []
cases_series = []
chart_denominator = 1000
for field in field_list:
field_unknowns_query = ('''
SELECT
%s,
count(*) as cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
%s
''')
if calculate_race_ethnicity and field == 'race_ethnicity_combined':
field_unknowns_query = ('''
SELECT
CASE ethnicity = "Non-Hispanic/Latino"
WHEN true THEN race
ELSE ethnicity
END as %s,
count(*) as cases
FROM
%s
GROUP BY
%s
''')
query = field_unknowns_query % (field, table, field)
field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
field_unknowns_df.set_index(field, inplace=True)
field_unknowns_df.index = field_unknowns_df.index.fillna('Null')
field_display_name = {
'cdc_case_earliest_dt': 'CDC earliest case date',
'current_status': 'Case status',
'res_state': 'State',
'res_county': 'County',
'sex': 'Sex',
'age_group': 'Age',
'race_ethnicity_combined': 'Race/Ethnicity',
'race': 'Race',
'ethnicity': 'Ethnicity',
'case_month': 'Case month'
}
missing_count = 0
if 'Missing' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['Missing'].cases
if 'Null' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['Null'].cases
if '' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc[''].cases
if 'OTH' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['OTH'].cases
if 'nul' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['nul'].cases
unknowns.loc['Missing', field] = missing_count
if 'Unknown' in field_unknowns_df.index:
unknowns.loc['Unknown', field] = field_unknowns_df.loc['Unknown'].cases
if 'NA' in field_unknowns_df.index:
unknowns.loc['NA', field] = field_unknowns_df.loc['NA'].cases
unknowns.loc['Known', field] = field_unknowns_df.cases.sum() - (
unknowns.loc['Missing', field] +
unknowns.loc['Unknown', field] +
unknowns.loc['NA', field])
field_series.extend([field_display_name.get(field, field)] * 4)
value_series.extend(['Known', 'Suppressed', 'Unknown', 'Missing'])
percent_series.extend([unknowns.loc['Known', field] / field_unknowns_df.cases.sum(),
unknowns.loc['NA', field] / field_unknowns_df.cases.sum(),
unknowns.loc['Unknown', field] / field_unknowns_df.cases.sum(),
unknowns.loc['Missing', field] / field_unknowns_df.cases.sum()])
cases_series.extend([unknowns.loc['Known', field] / chart_denominator,
unknowns.loc['NA', field] / chart_denominator,
unknowns.loc['Unknown', field] / chart_denominator,
unknowns.loc['Missing', field] / chart_denominator])
bars = pd.DataFrame.from_dict({'field': field_series,
'value': value_series,
'percent': percent_series,
'cases': cases_series})
return alt.Chart(bars).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='%'), title=''),
y=alt.Y('field', sort='x', title='Field'),
color=alt.Color('value', scale=alt.Scale(scheme='category20'), title='Value'),
order=alt.Order('field:N'),
tooltip=[
alt.Tooltip('field:N', title='Field'),
alt.Tooltip('value:N', title='Value'),
alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
]
).properties(title=title)
def CreateNYTStateDataframe(query, include_territories=False):
nyt_states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
if not include_territories:
for territory in NYT_TERRITORIES:
nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
nyt_states_df.set_index('state_fips_code', inplace=True)
return nyt_states_df
def CreateCDCStateDataframe(query):
states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
states_df = states_df[states_df.res_state != state]
states_df.rename(columns={'res_state': 'state'}, inplace=True)
states_df['state_fips_code'] = states_df.state
states_df = states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
states_df['state_fips_code'] = states_df.state_fips_code.astype(int)
states_df.set_index('state_fips_code', inplace=True)
return states_df
def CreateCDCStateRaceDataframe(query, cases_field_prefix, include_territories=False):
states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
states_df = states_df[states_df.res_state != state]
states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
states_df = states_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_KNOWN_MAP})
states_df.rename(columns={'res_state': 'state'}, inplace=True)
cases_field = cases_field_prefix + 'cases'
crosstab_df = pd.crosstab(states_df['state'],
states_df.race_ethnicity_combined,
values=states_df[cases_field],
aggfunc=sum,
margins=True,
margins_name=cases_field
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df[cases_field_prefix + 'known_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df[cases_field_prefix + 'known_or_na_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df[cases_field_prefix + 'known_cases_percent'] = round(
crosstab_df[cases_field_prefix + 'known_cases'] /
crosstab_df[cases_field], 4)
crosstab_df[cases_field_prefix + 'known_or_na_cases_percent'] = round(
crosstab_df[cases_field_prefix + 'known_or_na_cases'] /
crosstab_df[cases_field], 4)
crosstab_df['state_fips_code'] = crosstab_df.state
crosstab_df = crosstab_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
# Remove territories and missing states for calculating summary stats.
if not include_territories:
for territory in TERRITORIES:
crosstab_df = crosstab_df[crosstab_df.state != territory]
crosstab_df = crosstab_df[crosstab_df.state != 'NA']
crosstab_df = crosstab_df[crosstab_df.state != 'Missing']
crosstab_df = crosstab_df[crosstab_df.state != 'Unknown']
crosstab_df.set_index('state_fips_code', inplace=True)
return crosstab_df
def CreateCRDTStateRaceDataframe(query):
crdt_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for territory in TERRITORIES:
crdt_df = crdt_df[crdt_df.state != territory]
crdt_df['state_fips_code'] = crdt_df.state
crdt_df = crdt_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
crdt_df['state_fips_code'] = crdt_df.state_fips_code.astype(int)
crdt_df.set_index('state_fips_code', inplace=True)
return crdt_df
def CreateNYTCountyDataframe(query):
nyt_counties_df = pd.io.gbq.read_gbq(NYT_COUNTIES_QUERY, project_id=PROJECT_ID)
nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_df.county_fips.unique()
nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
nyt_counties_df.set_index('county_fips', inplace=True)
return nyt_counties_df
def CreateCDCCountyRaceDataframe(query, cases_field_prefix):
cdc_counties_race_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for territory in TERRITORIES:
cdc_counties_race_df = cdc_counties_race_df[cdc_counties_race_df.res_state != territory]
county_fips_map_df = pd.io.gbq.read_gbq(COUNTY_FIPS_MAPPING_QUERY, project_id=PROJECT_ID)
county_fips_map_df.cdc_county = county_fips_map_df.cdc_county.str.lower()
county_fips_map_df['state_county'] = county_fips_map_df.state + '-' + county_fips_map_df.cdc_county
county_fips_map_df['state_county'] = county_fips_map_df.state_county.astype('string').str.strip()
county_fips_map_df.set_index('state_county', inplace=True)
# Concatenate the state and county names because county names are not unique across states.
cdc_counties_race_df.res_county = cdc_counties_race_df.res_county.str.lower()
cdc_counties_race_df['state_county'] = cdc_counties_race_df.res_state + '-' + cdc_counties_race_df.res_county
cdc_counties_race_df['state_county'] = cdc_counties_race_df.state_county.astype('string').str.strip()
cdc_counties_race_df.set_index('state_county', inplace=True)
cdc_counties_race_df['race_ethnicity_combined'] = cdc_counties_race_df.race_ethnicity_combined.astype('string').str.strip()
cdc_counties_race_df = cdc_counties_race_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})
# Printed value used in the footnotes below.
# All other checks for county_fips_code mappings are now in
# https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006
mismatches_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'missing']
# print(mismatches_df.cases.sum())
cdc_counties_race_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')
cases_field = cases_field_prefix + 'cases'
# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
cdc_counties_race_crosstab_df = pd.crosstab(cdc_counties_race_df['county_fips'],
cdc_counties_race_df.race_ethnicity_combined,
values=cdc_counties_race_df[cases_field],
aggfunc=sum,
margins=True,
margins_name=cases_field
)
# Have to reset_index() to go from pandas multi-index to single index.
cdc_counties_race_crosstab_df = cdc_counties_race_crosstab_df.reset_index()
cdc_counties_race_crosstab_df.drop(axis=0, index=len(cdc_counties_race_crosstab_df) - 1, inplace=True)
cdc_counties_race_crosstab_df['county_fips'] = cdc_counties_race_crosstab_df.county_fips.astype(int)
cdc_counties_race_crosstab_df[cases_field_prefix + 'known_cases'] = (
cdc_counties_race_crosstab_df[cases_field] -
cdc_counties_race_crosstab_df.na_cases.fillna(0) -
cdc_counties_race_crosstab_df.unknown_cases.fillna(0))
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
acs_name_lookup_df = pd.io.gbq.read_gbq(ACS_POPULATION_DATA_QUERY, project_id=PROJECT_ID)
acs_name_lookup_df['state_county'] = (acs_name_lookup_df.county.astype('string').str.strip() +
', ' + acs_name_lookup_df.state.astype('string').str.strip())
acs_name_lookup_df.drop(columns=['state', 'county'], inplace=True)
acs_name_lookup_df.set_index('county_fips', inplace=True)
cdc_counties_race_df = cdc_counties_race_crosstab_df.join(acs_name_lookup_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
cdc_counties_race_df.county_fips = cdc_counties_race_df.county_fips.astype(int)
cdc_counties_race_df.set_index('county_fips', inplace=True)
return cdc_counties_race_df
def CreateNCHSStateDataframe(table_name):
nchs_deaths_state_df = pd.io.gbq.read_gbq(
NCHS_STATES_QUERY_STR % table_name, project_id=PROJECT_ID)
nchs_deaths_state_df = nchs_deaths_state_df.replace(
to_replace={'State': STATES_TO_ABBREVIATIONS})
nchs_deaths_state_counts_df = pd.io.gbq.read_gbq(
NCHS_STATE_COUNTS_QUERY_STR % table_name, project_id=PROJECT_ID)
nchs_deaths_state_counts_df = nchs_deaths_state_counts_df.replace(
to_replace={'State': STATES_TO_ABBREVIATIONS})
nchs_deaths_state_df['cdc_known_cases_percent'] = round((
nchs_deaths_state_df.Non_Hispanic_White.fillna(0) +
nchs_deaths_state_df.Non_Hispanic_Black_or_African_American.fillna(0) +
nchs_deaths_state_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
nchs_deaths_state_df.Non_Hispanic_Asian.fillna(0) +
nchs_deaths_state_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
nchs_deaths_state_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
nchs_deaths_state_counts_df['cdc_known_cases'] = round(
nchs_deaths_state_counts_df.Non_Hispanic_White.fillna(0) +
nchs_deaths_state_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
nchs_deaths_state_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
nchs_deaths_state_counts_df.Non_Hispanic_Asian.fillna(0) +
nchs_deaths_state_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
nchs_deaths_state_counts_df.Hispanic_or_Latino.fillna(0))
nchs_deaths_state_df = nchs_deaths_state_df.set_index('State')
nchs_deaths_state_counts_df = nchs_deaths_state_counts_df.set_index('State')
nchs_deaths_state_df = nchs_deaths_state_df.join(
nchs_deaths_state_counts_df, on="State", how='inner', lsuffix='_left', rsuffix='_right')
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
nchs_deaths_state_df['cdc_cases'] = round(
nchs_deaths_state_df.cdc_known_cases / nchs_deaths_state_df.cdc_known_cases_percent, 0)
# Comnbine the case counts for NY and New York City, recalculate percentage, remove New York City.
nchs_deaths_state_df.loc['NY', 'cdc_cases'] = (
nchs_deaths_state_df.loc['NY', 'cdc_cases'] +
nchs_deaths_state_df.loc['New York City', 'cdc_cases'])
nchs_deaths_state_df.loc['NY', 'cdc_known_cases'] = (
nchs_deaths_state_df.loc['NY', 'cdc_known_cases'] +
nchs_deaths_state_df.loc['New York City', 'cdc_known_cases'])
nchs_deaths_state_df.loc['NY', 'cdc_known_cases_percent'] = round(
nchs_deaths_state_df.loc['NY', 'cdc_known_cases'] /
nchs_deaths_state_df.loc['NY', 'cdc_cases'], 4)
nchs_deaths_state_df.drop(['New York City'], inplace=True)
nchs_deaths_state_df.reset_index(inplace=True)
nchs_deaths_state_df.rename(columns={'State': 'state'}, inplace=True)
nchs_deaths_state_df = nchs_deaths_state_df[
['state', 'cdc_cases', 'cdc_known_cases', 'cdc_known_cases_percent']].copy()
nchs_deaths_state_df['state_fips_code'] = nchs_deaths_state_df.state
nchs_deaths_state_df = nchs_deaths_state_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
return nchs_deaths_state_df
def CreateNCHSCountyDataframe(query):
df_acs_name_lookup = pd.io.gbq.read_gbq(ACS_POPULATION_DATA_QUERY, project_id=PROJECT_ID)
df_acs_name_lookup.set_index('county_fips', inplace=True)
nchs_deaths_county_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
nchs_deaths_county_df['county_fips'] = nchs_deaths_county_df.FIPS_Code
nchs_deaths_county_df.set_index('county_fips', inplace=True)
nchs_deaths_county_df['state_county'] = nchs_deaths_county_df.County_Name + ', ' + nchs_deaths_county_df.State
nchs_deaths_county_df['total_known_cases'] = round((
nchs_deaths_county_df.Non_Hispanic_White.fillna(0) +
nchs_deaths_county_df.Non_Hispanic_Black.fillna(0) +
nchs_deaths_county_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
nchs_deaths_county_df.Non_Hispanic_Asian.fillna(0) +
nchs_deaths_county_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
nchs_deaths_county_df.Hispanic.fillna(0)) * nchs_deaths_county_df.COVID_19_Deaths, 0)
nchs_deaths_county_df = nchs_deaths_county_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nchs_deaths_county_df.reset_index(inplace=True)
nchs_deaths_county_df.county_fips = nchs_deaths_county_df.county_fips.astype(int)
nchs_deaths_county_df.set_index('county_fips', inplace=True)
return nchs_deaths_county_df
def CreateScatterPlot(
chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
geo_field = 'state'
geo_field_display_name = 'State'
if geo == 'county':
geo_field = 'state_county'
geo_field_display_name = 'County'
if metric_type == 'ratio':
scale_scheme = 'blueorange'
scale_reverse = True
scale_domain = [0, 2]
legend_format = '.1f'
axis_format = ',.0f'
elif metric_type == 'percent':
scale_scheme = 'redyellowblue'
scale_reverse = False
scale_domain = [0, 1]
legend_format = '.0%'
axis_format = '.0%'
tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
for field in ('y', 'x', 'percent'):
tooltips.append(alt.Tooltip(
fields_dict[field]['name'] + ':Q',
format=fields_dict[field]['format'],
title=fields_dict[field]['title'],
))
plot = alt.Chart(chart_df).mark_circle(size=60).encode(
alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
scale=alt.Scale(domain=(0, scale_max))
),
alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
scale=alt.Scale(domain=(0, scale_max))
),
color=alt.Color(fields_dict['percent']['name'],
type='quantitative',
scale=alt.Scale(scheme=scale_scheme,
reverse=scale_reverse,
domain=scale_domain,
clamp=True),
legend=alt.Legend(format=legend_format),
title=metric_type.capitalize()),
tooltip=tooltips,
).properties(
height=height,
width=width,
)
if metric_type == 'ratio':
plot.interactive()
line = pd.DataFrame({
'x': [0, scale_max],
'y': [0, scale_max],
})
if metric_type == 'ratio':
line_plot = alt.Chart(line).mark_line(color='black').encode(
x='x',
y='y',
)
elif metric_type == 'percent':
line_plot = (
alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
)
# Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
scatter = (plot + line_plot).properties(
title=title,
height=height,
width=width,
).interactive()
return scatter
def CreateMap(
chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
geo_field = 'state'
geo_field_display_name = 'State'
fips_code = 'state_fips_code'
topo_feature = US_STATES_TOPO
if geo == 'county':
geo_field = 'state_county'
geo_field_display_name = 'County'
fips_code = 'county_fips'
topo_feature = US_COUNTIES_TOPO
if metric_type == 'ratio':
scale_scheme = 'blueorange'
scale_reverse = True
scale_domain = [0, 2]
legend_format = '.1f'
elif metric_type == 'percent':
scale_scheme = 'redyellowblue'
scale_reverse = False
scale_domain = [0, 1]
legend_format = '.0%'
highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
for field in ('y', 'x', 'percent'):
tooltips.append(alt.Tooltip(
fields_dict[field]['name'] + ':Q',
format=fields_dict[field]['format'],
title=fields_dict[field]['title'],
))
field_names = [geo_field]
field_names.extend([fields_dict[field]['name'] for field in fields_dict])
plot = alt.Chart(topo_feature).mark_geoshape(
stroke='white',
strokeOpacity=.2,
strokeWidth=1
).project(
type='albersUsa'
).transform_lookup(
lookup='id',
from_=alt.LookupData(chart_df, fips_code, field_names)
).encode(
alt.Color(fields_dict['percent']['name'],
type='quantitative',
legend=alt.Legend(format=legend_format),
scale=alt.Scale(scheme=scale_scheme,
reverse=scale_reverse,
domain=scale_domain,
clamp=True,
),
title=metric_type.capitalize()),
tooltip=tooltips
).add_selection(
highlight,
)
states_outline = alt.Chart(US_STATES_TOPO).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
type='albersUsa'
)
states_fill = alt.Chart(US_STATES_TOPO).mark_geoshape(
fill='silver',
stroke='white'
).project('albersUsa')
layered_map = alt.layer(states_fill, plot, states_outline).properties(
height=height,
width=width,
title=title,
)
return layered_map
def CreateScatterPlotAndMap(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
scatter = CreateScatterPlot(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
map = CreateMap(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
return (scatter | map).configure_view(
strokeWidth=0,
).configure_mark(
stroke='grey'
).configure_legend(
gradientLength=scatter_height - 50
)
def PrintSummaryStats(chart_df, field='percent'):
plus_minus_15_df = chart_df[chart_df[field] >= .85]
plus_minus_15_df = plus_minus_15_df[plus_minus_15_df[field] <= 1.15]
print('between +/-15%: ', len(plus_minus_15_df), round(len(plus_minus_15_df) / len(chart_df), 2))
plus_minus_50_df = chart_df[chart_df[field] >= .50]
plus_minus_50_df = plus_minus_50_df[plus_minus_50_df[field] <= 1.50]
print('between +/-50%: ', len(plus_minus_50_df), round(len(plus_minus_50_df) / len(chart_df), 2))
print('< than .50: ', len(chart_df[chart_df[field] < .5]))
print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
print(chart_df[field].describe())
The Covid Tracking Project was the most reliable source for COVID-19 data with race/ethnicity at the state level until it stopped collecting data on March 7, 2021. The CDC's Case Surveillance Restricted Access and National Center for Health Statistics provisional deaths datasets are the best available replacements for the Covid Tracking Project's dataset, and they additionally include county-level data and age along with race/ethnicity. This paper evaluates the completeness of the CDC datasets at the state and county levels in terms of (1) the total number of deaths included compared to the New York Times, and (2) the number of deaths included with race/ethnicity data compared to the Covid Tracking Project.
The CDC's Restricted Access dataset contains 79% of the deaths in the New York Times up to April 15, and 84% of deaths have race/ethnicity information vs. 93% in the Covid Tracking Project. At the state and county levels, the dataset's completeness is highly variable with 11 states reporting fewer than 10% of deaths and eight reporting 0% of the deaths included in the New York Times. The National Center for Health Statistics' dataset is highly complete in all states except for North Carolina. At the county level, the National Center for Health Statistics' dataset is more complete within the counties it contains, but it only contains counties with at least 100 COVID-19 deaths, which are generally counties with larger populations.
This analysis picks up where the case data completeness analysis left off to evaluate COVID-19 deaths data with race/ethnicity at the state and county levels.
While we only have three options for COVID-19 case data with race/ethnicity, we have five options for COVID-19 deaths data with race/ethnicity:
In the case data completeness analysis, we compared the CDC's Restricted Access dataset to the New York Times and Covid Racial Data Tracker. In this analysis, we'll compare the CDC's Restricted Access dataset and NCHS datasets to those data sources. We won't analyze the CDC's Public Use with Geography dataset because it is similar to the Restricted Access dataset but has more privacy suppression.
Our analysis shows that the CDC's Case Surveillance Restricted Access dataset has two main data completeness issues:
For the 373K deaths where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:
#@title
overall_df = pd.io.gbq.read_gbq(CDC_OVERALL_RACE_QUERY, project_id=PROJECT_ID)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})
overall_df = overall_df.set_index('race_ethnicity_combined')
chart_denominator = 1000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
overall_df.cases['black_cases'] / chart_denominator,
overall_df.cases['white_cases'] / chart_denominator,
overall_df.cases['asian_cases'] / chart_denominator,
overall_df.cases['nhpi_cases'] / chart_denominator,
overall_df.cases['aian_cases'] / chart_denominator,
overall_df.cases.sum() / chart_denominator,
]
# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
60481746 / chart_denominator,
40596040 / chart_denominator,
196789401 / chart_denominator,
18427914 / chart_denominator,
565473 / chart_denominator,
2236348 / chart_denominator,
328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
'Hispanic/Latino',
'Black',
'White',
'Asian',
'Native Hawaiian/Pacific Islander',
'American Indian/Alaska Native',
'*Total Including Unknowns*',
], 'percent': percent_list,
'cases': cases_list,
'population': pop_list,
})
bars = alt.Chart(prevalence).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='.2%'), scale=alt.Scale(domain=(0, .0029)), title=''),
y=alt.Y('group', sort='-x', title=''),
color=alt.Color('group',
scale=alt.Scale(scheme='tableau20'),
title='',
legend=None),
tooltip=[
alt.Tooltip('group:N', title='Race/Ethnicity Group'),
alt.Tooltip('percent:Q', format='.3%', title='Percent who died'),
alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
]
).properties(
title='Percent of Race/Ethnicity Group who died from COVID-19 based on incomplete CDC Data up to %s' % DATE_DISPLAY_NAME)
bars.display()
#alt.concat(bars).properties(
# title=alt.TitleParams(
# ['Source: U.S. Census Bureau\'s American Community Survey 2019 5-year estimates for population data.'],
# baseline='bottom',
# dy=20,
# orient='bottom',
# fontWeight='normal',
# fontSize=11
# )
#).display()
The chart above is based on incomplete data. With only 79% of deaths included, the total percent of people who died from COVID-19 should be 0.17% instead of 0.14%. It's harder to estimate how much the race/ethnicity groups are undercounting the number of confirmed COVID-19 deaths. If we added all 73K deaths with missing race/ethnicity to the Hispanic/Latino group, the percent of Hispanic/Latinos in the U.S. who died from COVID-19 would go from 0.11% to 0.23% — a 2x increase. If all 73K deaths with missing race/ethnicity were Black people, the percent of Black people who died from COVID-19 would go from 0.12% to 0.30% — a 2.5x increase. While these extreme scenarios are unlikely, they show us why missing race/ethnicity data is preventing us from truly understanding and addressing the disparities in the COVID-19 pandemic in the U.S.
We can get a more complete view of the deaths within each race/ethnicity group from the NCHS State dataset, which contains 97% of the deaths in the New York Times up to April 14, where 99% of those deaths have race/ethnicity information.
#@title
overall_provisional_df = pd.io.gbq.read_gbq(NCHS_STATES_OVERALL_QUERY, project_id=PROJECT_ID)
overall_provisional_counts_df = pd.io.gbq.read_gbq(NCHS_STATE_COUNTS_OVERALL_QUERY, project_id=PROJECT_ID)
overall_provisional_df['cdc_known_cases_percent'] = round((
overall_provisional_df.Non_Hispanic_White.fillna(0) +
overall_provisional_df.Non_Hispanic_Black_or_African_American.fillna(0) +
overall_provisional_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
overall_provisional_df.Non_Hispanic_Asian.fillna(0) +
overall_provisional_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
overall_provisional_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
overall_provisional_counts_df['cdc_known_cases'] = round(
overall_provisional_counts_df.Non_Hispanic_White.fillna(0) +
overall_provisional_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
overall_provisional_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
overall_provisional_counts_df.Non_Hispanic_Asian.fillna(0) +
overall_provisional_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
overall_provisional_counts_df.Hispanic_or_Latino.fillna(0))
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
total_cases_including_unknowns = round(
overall_provisional_counts_df.cdc_known_cases[0] / overall_provisional_df.cdc_known_cases_percent[0], 0)
chart_denominator = 1000
cases_list = [overall_provisional_counts_df.Hispanic_or_Latino[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_Black_or_African_American[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_White[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_Asian[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native[0] / chart_denominator,
total_cases_including_unknowns / chart_denominator,
]
# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
60481746 / chart_denominator,
40596040 / chart_denominator,
196789401 / chart_denominator,
18427914 / chart_denominator,
565473 / chart_denominator,
2236348 / chart_denominator,
328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
'Hispanic/Latino',
'Black',
'White',
'Asian',
'Native Hawaiian/Pacific Islander',
'American Indian/Alaska Native',
'*Total Including Unknowns*',
], 'percent': percent_list,
'cases': cases_list,
'population': pop_list,
})
bars = alt.Chart(prevalence).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='.2%'), title=''),
y=alt.Y('group', sort='-x', title=''),
color=alt.Color('group',
scale=alt.Scale(scheme='tableau20'),
title='',
legend=None),
tooltip=[
alt.Tooltip('group:N', title='Race/Ethnicity Group'),
alt.Tooltip('percent:Q', format='.3%', title='Percent who died'),
alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
]
).properties(
title='Percent of Race/Ethnicity Group who died from COVID-19 based on more complete NCHS Data up to %s' % NCHS_DATE_DISPLAY_NAME
)
bars.display()
We can see that all of the percentages are larger due to having more complete death counts and more deaths with known race/ethnicity. In these results, 0.17% of Hispanics/Latinos and 0.20% of Black people have died from COVID-19, which is 1.5x and 1.6x the results from the CDC's Restricted Access dataset above, respectively. American Indians/Alaska Natives have the highest death rate of 0.28% — 2.2x the rate from the CDC's Restricted Access dataset.
The goal of this analysis is to assess the completeness of the CDC's Restricted Access and NCHS datasets, which will help evaluate their usefulness in examining disparities in race/ethnicity for COVID-19 deaths at the state and county levels.
The overall data completeness findings for the CDC's Restricted Access Dataset are:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
FieldAnalysis(PROJECT_ID, CDC_TABLE, field_list, 'Most Complete Fields in the CDC Restricted Access Dataset').display()
The CDC's NCHS State and NCHS County datasets come from death certificates and are more complete than the CDC's Restricted Access dataset. The NCHS State dataset contains 97% of the total deaths in the NYT and 99% of the deaths have race/ethnicity information. The NCHS also provides a high-level overview of these data on a U.S.- and state-level dashboard.
We can evaluate the overall completeness of the CDC and NCHS datasets by calculating at (1) the percent of total death counts compared to the NYT, (2) the percent of deaths with race/ethnicity, and then (3) combine those two percentages into a composite that represents the percentage of total expected deaths that have race/ethnicity. Later on, we will do this same analysis at the state and county levels.
#@title
# Manually update these fields based on chart above, latest CDC data,
# and improving state/county data below.
overall_row_names = [
'Update frequency',
'Latest death date',
'Deaths in dataset as of date',
'Deaths in NYT as of date',
'(as a % of NYT)',
'Deaths with race/ethnicity',
'(as a % of deaths in dataset)',
'(composite % of NYT total with race/ethnicity)',
]
overall_crdt_metadata = [
'Stopped',
'Mar 7, 2021',
'525K',
'525K',
'(100%)',
'490K',
'(93%)',
'(93%)',
]
overall_cdc_metadata = [
'Every two weeks',
'Apr 15, 2021',
'446K',
'565K',
'(79%)',
'373K',
'(84%)',
'(66%)',
]
overall_nchs_state_metadata = [
'Weekly',
'Apr 14, 2021',
'546K',
'564K',
'(97%)',
'542K',
'(99%)',
'(96%)',
]
overall_nchs_county_metadata = [
'Weekly',
'Apr 14, 2021',
'490K',
'564K',
'(87%)',
'482K',
'(98%)',
'(85%)',
]
table_data = {'CRDT': overall_crdt_metadata,
'CDC Restricted': overall_cdc_metadata,
'NCHS State': overall_nchs_state_metadata,
'NCHS County': overall_nchs_county_metadata,
}
metadata_df = pd.DataFrame(table_data, index=overall_row_names)
metadata_df.head(15)
The NCHS dataset offers an excellent alternative to the CDC's Restricted Access dataset at the state level. At the county level, the NCHS dataset only includes counties with 100 or more deaths, so it contains fewer counties than the Restricted Access dataset, but those counties account for 81% of the U.S. population. We will discuss the tradeoffs at the county level in more detail later on in this analysis.
What we didn't include in this report:
To get a baseline of how much we could expect the CDC death counts to match the CRDT or NYT, we can see how closely the CRDT and NYT match each other. Each dot below is a state (hover to see details), and the black line shows where the CRDT and NYT death counts are equal.
#@title
crdt_states_df = pd.io.gbq.read_gbq(CRDT_QUERY, project_id=PROJECT_ID)
crdt_states_df.set_index('state', inplace=True)
nyt_states_compare_crdt_df = CreateNYTStateDataframe(NYT_STATES_COMPARE_CRDT_QUERY)
crdt_states_df.reset_index(inplace=True)
crdt_states_df['state_fips_code'] = crdt_states_df.state
crdt_states_df = crdt_states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
crdt_states_df.set_index('state_fips_code', inplace=True)
nyt_crdt_counts_df = nyt_states_compare_crdt_df.join(crdt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
nyt_crdt_counts_df['percent'] = round(nyt_crdt_counts_df.crdt_cases / nyt_crdt_counts_df.nyt_cases, 2)
nyt_crdt_counts_df.reset_index(inplace=True)
#@title
nyt_crdt_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CRDT to NYT'},
}
nyt_crdt_title = 'Ratio of CRDT to NYT Deaths by State up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
nyt_crdt_counts_df, nyt_crdt_fields_dict, nyt_crdt_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(nyt_crdt_counts_df)
We can see below that the CDC death counts differ from the NYT death counts more drastically than the CRDT did with only 29 states within a +/-0.15 ratio of the NYT counts. Note: In the analysis and charts below, we'll refer to the CDC Restricted Access dataset as the "CDC" dataset.
#@title
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY)
cdc_nyt_states_df = nyt_states_df.join(cdc_states_df, on="state_fips_code", how='left', lsuffix='', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
# Fix the states that are missing from the CDC data.
cdc_nyt_states_df.fillna(0, inplace=True)
cdc_nyt_states_df.state = cdc_nyt_states_df.state_name
cdc_nyt_states_df = cdc_nyt_states_df.replace(
to_replace={'state': STATES_TO_ABBREVIATIONS})
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
#@title
cdc_nyt_state_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_state_title = 'Ratio of CDC to NYT Deaths by State up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_nyt_states_df)
#cdc_nyt_states_df.sort_values(by='percent')
There are many outlier states that are too far off from the NYT death counts to be explained by a time lag:
We can do the same analysis at the county level. The CDC Restricted Access dataset contains 2,419 counties in the 50 states + D.C., which is 77% of all counties and accounts for 87% of the population.
Each dot is a county (hover to see details). We show all 2,413 counties in the CDC data that were also in the NYT data on the left and zoom in on the smaller counties on the right. Note that the five counties in New York City and one borough in Alaska are missing because the NYT combined them into one region for New York City and a combined region in Alaska; see the case data completeness report's Appendix for more details.
#@title
# CDC vs. NYT county
cdc_counties_race_df = CreateCDCCountyRaceDataframe(CDC_COUNTIES_RACE_QUERY, 'cdc_')
nyt_counties_df = CreateNYTCountyDataframe(NYT_COUNTIES_QUERY)
cdc_nyt_counties_race_df = cdc_counties_race_df.join(nyt_counties_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
cdc_nyt_counties_race_df = cdc_nyt_counties_race_df.reset_index()
cdc_nyt_counties_race_df['percent'] = round(cdc_nyt_counties_race_df.cdc_cases / cdc_nyt_counties_race_df.nyt_cases, 2)
cdc_nyt_counties_race_df['cdc_known_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_cases /
cdc_nyt_counties_race_df.cdc_cases, 2)
cdc_nyt_counties_race_df['cdc_known_or_na_cases'] = (cdc_nyt_counties_race_df.cdc_known_cases.fillna(0) +
cdc_nyt_counties_race_df.na_cases.fillna(0))
cdc_nyt_counties_race_df['cdc_known_or_na_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_or_na_cases /
cdc_nyt_counties_race_df.cdc_cases, 2)
#PrintSummaryStats(cdc_nyt_counties_race_df)
# These values are used in the num_counties table.
#print(len(cdc_counties_race_df))
#print(len(cdc_counties_race_df) / 3143)
#print(cdc_counties_race_df.total_pop.sum())
#print(cdc_counties_race_df.total_pop.sum() / 324697795) # Population covered in these counties
#print(cdc_counties_race_df.cdc_known_cases.sum())
#print(0.55 * 324697795) # NYT population
#@title
cdc_nyt_county_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_county_title = 'Ratio of CDC to NYT Deaths by County up to %s' % DATE_DISPLAY_NAME
zoom_cdc_nyt_title = 'Zoom in on counties with up to 3,000 Deaths'
cdc_nyt_county_scatter = CreateScatterPlot(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, COUNTY_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)
cdc_nyt_county_zoom_scatter = CreateScatterPlot(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, zoom_cdc_nyt_title, COUNTY_CASES_ZOOM_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)
(cdc_nyt_county_scatter | cdc_nyt_county_zoom_scatter).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_counties_race_df)
We can also view these ratios on the map on the right and compare them to the state-level totals map on the left.
#@title
cdc_nyt_states_title = 'Ratio of CDC to NYT Deaths by County up to %s' % DATE_DISPLAY_NAME
cdc_nyt_county_map = CreateMap(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'county', 'ratio'
)
cdc_nyt_state_map = CreateMap(
cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_states_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'ratio'
)
(cdc_nyt_state_map | cdc_nyt_county_map).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
Notes:
How much race/ethnicity information is available in the CDC data at the state and county levels?
#@title
cdc_states_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_QUERY, 'cdc_')
cdc_states_race_df.reset_index(inplace=True)
cdc_race_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Deaths with race/ethnicity'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent deaths with race/ethnicity'},
}
cdc_states_race_title = 'CDC Percent of Deaths with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_states_race_map = CreateMap(
cdc_states_race_df, cdc_race_fields_dict, cdc_states_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
cdc_counties_race_title = 'CDC Percent of Deaths with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_counties_race_map = CreateMap(
cdc_nyt_counties_race_df, cdc_race_fields_dict, cdc_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'county', 'percent'
)
(cdc_states_race_map | cdc_counties_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_states_race_df, 'cdc_known_cases_percent')
#PrintSummaryStats(cdc_nyt_counties_race_df, 'cdc_known_cases_percent')
Note:
The average percent of deaths with race/ethnicity at the state level is 71%. Only 16 states have 85% or more deaths with race/ethnicity, and 39 states have 50% or more deaths with race/ethnicity. The average percent of deaths with race/ethnicity at the county level is 73%. Only 1,119 counties have 85% or more deaths with race ethnicity, and 2,061 counties have 50% or more deaths with race/ethnicity.
How does the CDC dataset compare to the CRDT dataset, which is the most up-to-date aggregate dataset for race/ethnicity at the state level up to March 7, 2021? Overall, 93% of the deaths in the CRDT data have race information and 91% have ethnicity information. In the CDC data up to March 7, 84% of deaths have race/ethnicity combined information.
We will use the race field in the CRDT data as a proxy for a combined race/ethnicity field. For some states, the race field is a combined race/ethnicity field that exactly matches how the CDC Restricted Access dataset reports race/ethnicity. However, the CRDT also captures the many non-standard ways that states report race/ethnicity, as described in this Covid Racial Data Tracker analysis. There's no way to do an exact comparison between the standardized race/ethnicity category in the CDC data and the many ways that race and ethnicity are reported in the CRDT. We use the race field in the CRDT dataset because it's a closer approximation of the combined race/ethnicity field than the ethnicity field.
#@title
cdc_up_to_crdt_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_UP_TO_CRDT_QUERY, 'cdc_')
crdt_df = CreateCRDTStateRaceDataframe(CRDT_QUERY)
cdc_crdt_race_df = crdt_df.join(cdc_up_to_crdt_race_df, on="state_fips_code", how='left', lsuffix='', rsuffix='_right')
cdc_crdt_race_df.reset_index(inplace=True)
cdc_crdt_race_df['percent'] = round(cdc_crdt_race_df.cdc_known_cases / cdc_crdt_race_df.crdt_known_race_cases, 4)
#@title
crdt_race_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Deaths with race/ethnicity'},
'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent deaths with race/ethnicity'},
}
crdt_race_title = 'CRDT Percent of Deaths with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
crdt_race_map = CreateMap(
cdc_crdt_race_df, crdt_race_fields_dict, crdt_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH - 5, 'state', 'percent'
)
cdc_states_race_up_to_crdt_title = 'CDC Percent of Deaths with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
cdc_states_race_up_to_crdt_map = CreateMap(
cdc_crdt_race_df, cdc_race_fields_dict, cdc_states_race_up_to_crdt_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
(cdc_states_race_up_to_crdt_map | crdt_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_crdt_race_df, 'crdt_known_race_cases_percent')
Note that we only looked at CDC deaths up to March 7 when comparing against the CRDT.
We can also compare the number of deaths within each state that has known race/ethnicity instead of the percent of deaths.
#@title
# Handle case of states with no deaths in CDC data.
cdc_crdt_race_df.fillna(0, inplace=True)
cdc_crdt_race_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT deaths with race/ethnicity'},
'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC deaths with race/ethnicity'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_race_title = 'Ratio of CDC to CRDT Deaths with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_crdt_race_df, cdc_crdt_race_fields_dict, cdc_crdt_race_title, CASES_RACE_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_crdt_race_df[cdc_crdt_race_df.state != 'ND'])
Notes:
At the state level, the CDC dataset has an average ratio of 0.67 deaths with race/ethnicity vs. the CRDT.
The CDC has an alternative public source for death data with race/ethnicity at the state and county levels that comes from death certificates via the NCHS. The table below summarizes the completeness of the CDC's Restricted Access dataset vs. the NCHS State and County datasets.
#@title
# Manually update these fields based on the latest CDC data.
num_counties_row_names = [
'(% of NYT deaths)',
'(% of deaths with race/ethnicity)',
'(composite % of NYT total with race/ethnicity)',
'Number of counties',
'(as a % of all counties)',
'Population in those counties',
'(as a % of total U.S population – States + D.C.)',
]
num_counties_cdc_metadata = [
'(79%)',
'(84%)',
'(66%)',
'2,419',
'(77%)',
'281M',
'(87%)',
]
num_counties_nchs_state_metadata = [
'(97%)',
'(99%)',
'(96%)',
'-',
'-',
'-',
'-',
]
num_counties_nchs_county_metadata = [
'(87%)',
'(98%)',
'(85%)',
'758',
'(24%)',
'264M',
'(81%)',
]
table_data = {'CDC Restricted': num_counties_cdc_metadata,
'NCHS State': num_counties_nchs_state_metadata,
'NCHS County': num_counties_nchs_county_metadata,
}
metadata_df = pd.DataFrame(table_data, index=num_counties_row_names)
metadata_df.head(15)
The NCHS datasets are more complete than the CDC's Restricted Access dataset in every way except that the NCHS County dataset contains fewer counties because it only includes counties with at least 100 deaths. The American Public Media Research Lab also found that the NCHS State dataset was more complete than public health website data for 16 states.
The death counts in the NCHS datasets are fairly close to those in the NYT dataset with 45 states within a +/-0.15 ratio of the NYT counts.
#@title
nchs_deaths_state_df = CreateNCHSStateDataframe(NCHS_STATES_TABLE)
nyt_states_nchs_df = CreateNYTStateDataframe(NYT_STATES_NCHS_QUERY)
nchs_nyt_states_df = nchs_deaths_state_df.join(nyt_states_nchs_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
nchs_nyt_states_df.reset_index(inplace=True)
nchs_nyt_states_df['percent'] = round(nchs_nyt_states_df.cdc_cases / nchs_nyt_states_df.nyt_cases, 4)
nchs_deaths_counties_df = CreateNCHSCountyDataframe(NCHS_COUNTIES_QUERY)
nyt_counties_nchs_df = CreateNYTCountyDataframe(NYT_COUNTIES_NCHS_QUERY)
nchs_nyt_counties_df = nchs_deaths_counties_df.join(nyt_counties_nchs_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
nchs_nyt_counties_df = nchs_nyt_counties_df.reset_index()
nchs_nyt_counties_df['percent'] = round(nchs_nyt_counties_df.COVID_19_Deaths / nchs_nyt_counties_df.nyt_cases, 4)
nchs_nyt_counties_df['cdc_known_cases_percent'] = round(nchs_nyt_counties_df.total_known_cases / nchs_nyt_counties_df.COVID_19_Deaths, 4)
#print('county file deaths: ', nchs_nyt_counties_df.COVID_19_Deaths.sum())
#print('county file known race/ethnicity: ', nchs_nyt_counties_df.total_known_cases.sum())
#print('county file % known race/ethnicity: ', nchs_nyt_counties_df.total_known_cases.sum() / nchs_nyt_counties_df.COVID_19_Deaths.sum())
#print('total counties: ', len(nchs_nyt_counties_df))
#print('as % of counties: ', len(nchs_nyt_counties_df) / 3143)
#print('population in counties: ', nchs_nyt_counties_df.total_pop.sum())
#print('as % of total population: ', nchs_nyt_counties_df.total_pop.sum() / 324697795) # Population covered in these counties
#print('state file deaths: ', nchs_nyt_states_df.cdc_cases.sum())
#print('state file known race/ethnicity: ', nchs_nyt_states_df.cdc_known_cases.sum())
#print('state file % known race/ethnicity: ', nchs_nyt_states_df.cdc_known_cases.sum() / nchs_nyt_states_df.cdc_cases.sum())
#@title
nchs_nyt_state_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'NCHS deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of NCHS to NYT'},
}
nchs_nyt_state_title = 'Ratio of NCHS to NYT Deaths up to %s' % NCHS_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
nchs_nyt_states_df, nchs_nyt_state_fields_dict, nchs_nyt_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(nchs_nyt_states_df)
When we compare death counts at the county level, we can see that the NCHS County dataset only contains a subset of U.S. counties (758 or 24% of all counties). We can also see that these counties are generally those with larger populations (counties with at least 100 deaths).
#@title
nchs_nyt_county_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'NCHS deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of NCHS to NYT'},
}
nchs_nyt_county_title = 'Ratio of NCHS to NYT Deaths up to %s' % NCHS_DATE_DISPLAY_NAME
nchs_nyt_county_map = CreateScatterPlotAndMap(
nchs_nyt_counties_df, nchs_nyt_county_fields_dict, nchs_nyt_county_title, COUNTY_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'county', 'ratio'
)
(nchs_nyt_county_map).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(nchs_nyt_counties_df)
In the charts and maps below, we can see that the NCHS datasets have a strikingly high percentage of deaths with known race/ethnicity.
#@title
nchs_known_state_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'NCHS deaths'},
'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}
nchs_known_state_title = 'NCHS Deaths with Known Race/Ethnicity up to %s' % NCHS_DATE_DISPLAY_NAME
nchs_known_state_map = CreateMap(
nchs_nyt_states_df, nchs_known_state_fields_dict, nchs_known_state_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH - 5, 'state', 'percent'
)
nchs_known_county_fields_dict = {
'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'NCHS deaths'},
'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}
nchs_known_county_title = 'NCHS Deaths with Known Race/Ethnicity up to %s' % NCHS_DATE_DISPLAY_NAME
nchs_known_county_map = CreateMap(
nchs_nyt_counties_df, nchs_known_county_fields_dict, nchs_known_county_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH - 5, 'county', 'percent'
)
(nchs_known_state_map | nchs_known_county_map).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(nchs_nyt_states_df, field='cdc_known_cases_percent')
#PrintSummaryStats(nchs_nyt_counties_df, field='cdc_known_cases_percent')
At the state level, the average percent of deaths with race/ethnicity is 99% vs. 97% at the county level. The minimum percentage of deaths with race/ethnicity at the state level is 89% in Hawaii and 81% at the county level.
Notes:
We can see below that the NCHS State dataset has more consistently high percentages of race/ethnicity than the CRDT by comparing the NCHS data up to March 3, which was the closest release date of the NCHS data to the Covid Tracking Project's last data collection on March 7.
#@title
nchs_up_to_crdt_race_df = CreateNCHSStateDataframe(NCHS_STATES_CRDT_COMPARE_TABLE)
nchs_crdt_df = CreateCRDTStateRaceDataframe(CRDT_NCHS_QUERY)
nchs_crdt_race_df = nchs_up_to_crdt_race_df.join(nchs_crdt_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
nchs_crdt_race_df.reset_index(inplace=True)
nchs_crdt_race_df['percent'] = round(nchs_crdt_race_df.cdc_known_cases / nchs_crdt_race_df.crdt_known_race_cases, 4)
#@title
nchs_race_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Deaths with race/ethnicity'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'NCHS deaths'},
'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent deaths with race/ethnicity'},
}
nchs_crdt_race_title = 'CRDT Percent Deaths with Race/Ethnicity up to %s' % CRDT_COMPARE_NCHS_DATE_DISPLAY_NAME
nchs_crdt_race_map = CreateMap(
nchs_crdt_race_df, crdt_race_fields_dict, nchs_crdt_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH - 5, 'state', 'percent'
)
nchs_states_race_up_to_crdt_title = 'NCHS Percent of Deaths with Race/Ethnicity up to %s' % CRDT_COMPARE_NCHS_DATE_DISPLAY_NAME
nchs_states_race_up_to_crdt_map = CreateMap(
nchs_crdt_race_df, nchs_race_fields_dict, nchs_states_race_up_to_crdt_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
(nchs_states_race_up_to_crdt_map | nchs_crdt_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(nchs_crdt_race_df, 'crdt_known_race_cases_percent')
We can compare these percentages more directly by taking the ratio of the count of deaths with race/ethnicity in each dataset at the state level.
#@title
nchs_crdt_ratio_race_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT deaths with race/ethnicity'},
'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'NCHS deaths with race/ethnicity'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of NCHS to CRDT'},
}
nchs_crdt_ratio_race_title = 'Ratio of NCHS to CRDT Deaths with Race/Ethnicity up to %s' % CRDT_COMPARE_NCHS_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
nchs_crdt_race_df, nchs_crdt_ratio_race_fields_dict, nchs_crdt_ratio_race_title, CASES_RACE_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'state', 'ratio'
).display()
#PrintSummaryStats(nchs_crdt_race_df[nchs_crdt_race_df.state != 'ND'])
Notes:
The ratio of NCHS deaths with race/ethnicity to CRDT deaths with race/ethnicity is between +/-0.15 for 38 states and +/-0.50 for 47 states.
How can we use these datasets at the state and county levels?
When using these datasets, we recommend being transparent about the level of completeness or lack thereof in (1) the total death counts, and (2) the percentage of deaths with race/ethnicity. Just like we created a composite measure of the overall datasets along these lines, we do the same at the state and county levels in the charts below. You can think of these as "nutrition facts" labels for each state and county's data.
We also recommend looking at the race/ethnicity breakdowns within each state or county to see if race/ethnicity information is missing disproportionately from one group. If states or counties' data are too incomplete to draw conclusions from, you may want to exclude them entirely from your analyses. You may also find it useful to include unknowns to highlight the incompleteness in the data.
The scatterplots below can help show the completeness issues in each state and county. The scatterplots show (1) death counts as a percentage of the NYT total death counts on the y-axis, and (2) the percentage of deaths with known race/ethnicity on the x-axis. The colors of the dots and states on the map show the composite completeness measure by multiplying those two numbers together, which is the composite percentage of total expected deaths that have race/ethnicity in the datasets (blue is more complete and red is less complete).
#@title
cdc_crdt_race_df = cdc_up_to_crdt_race_df.join(crdt_df, on="state_fips_code", how='right', lsuffix='_left', rsuffix='_right')
cdc_crdt_race_df.reset_index(inplace=True)
nyt_crdt_counts_df.set_index('state_fips_code', inplace=True)
cdc_crdt_race_df.set_index('state_fips_code', inplace=True)
cdc_crdt_race_df_composite = nyt_crdt_counts_df.join(cdc_crdt_race_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
nyt_crdt_counts_df.reset_index(inplace=True)
cdc_crdt_race_df_composite.reset_index(inplace=True)
cdc_crdt_race_df_composite['percent'] = round(cdc_crdt_race_df_composite.crdt_cases / cdc_crdt_race_df_composite.nyt_cases, 2)
cdc_crdt_race_df_composite['cases_max_100_percent'] = cdc_crdt_race_df_composite.percent.clip(upper=1)
cdc_crdt_race_df_composite['composite_percent'] = cdc_crdt_race_df_composite.cases_max_100_percent * cdc_crdt_race_df_composite.crdt_known_race_cases_percent
crdt_composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CRDT percent of NYT total deaths'},
'x': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'CRDT percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CRDT percent of NYT total with race/ethnicity'},
}
crdt_composite_title = 'CRDT Percent of NYT Deaths with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
state_composite_map = CreateScatterPlotAndMap(
cdc_crdt_race_df_composite, crdt_composite_fields_dict, crdt_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_crdt_race_df_composite, 'composite_percent')
#@title
cdc_nyt_states_race_df = cdc_states_race_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_race_df.reset_index(inplace=True)
cdc_nyt_states_race_df['percent'] = round(cdc_nyt_states_race_df.cdc_cases / cdc_nyt_states_race_df.nyt_cases, 2)
cdc_nyt_states_race_df['cases_max_100_percent'] = cdc_nyt_states_race_df.percent.clip(upper=1)
cdc_nyt_states_race_df['composite_percent'] = cdc_nyt_states_race_df.cases_max_100_percent * cdc_nyt_states_race_df.cdc_known_cases_percent
composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CDC percent of NYT total with race/ethnicity'},
}
composite_title = 'CDC Percent of NYT Deaths with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
state_composite_map = CreateScatterPlotAndMap(
cdc_nyt_states_race_df, composite_fields_dict, composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
# Mind the missing states for percentages.
#PrintSummaryStats(cdc_nyt_states_race_df, 'composite_percent')
#@title
nchs_nyt_states_df['cases_max_100_percent'] = nchs_nyt_states_df.percent.clip(upper=1)
nchs_nyt_states_df['composite_percent'] = nchs_nyt_states_df.cases_max_100_percent * nchs_nyt_states_df.cdc_known_cases_percent
nchs_composite_title = 'NHCS Percent of NYT Deaths with Race/Ethnicity up to %s' % NCHS_DATE_DISPLAY_NAME
nchs_composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'NCHS percent of NYT total deaths'},
'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'NCHS percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: NCHS percent of NYT total with race/ethnicity'},
}
state_composite_map = CreateScatterPlotAndMap(
nchs_nyt_states_df, nchs_composite_fields_dict, nchs_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(nchs_nyt_states_df, 'composite_percent')
Notes:
To compare these three datasets, we can look at how many states have a composite completeness measure of at least 50% and at least 85%.
#@title
# Manually update these fields based on the latest CDC data.
states_row_names = [
'Number of states with composite >= 50%',
'(as a percent of all states + D.C.)',
'Number of states with composite >= 85%',
'(as a percent of all states + D.C.)',
]
states_crdt_metadata = [
'50',
'(98%)',
'47',
'(92%)',
]
states_cdc_metadata = [
'33',
'(65%)',
'14',
'(27%)',
]
states_nchs_metadata = [
'51',
'(100%)',
'47',
'(92%)',
]
table_data = {
'CRDT': states_crdt_metadata,
'CDC Restricted': states_cdc_metadata,
'NCHS State': states_nchs_metadata,
}
metadata_df = pd.DataFrame(table_data, index=states_row_names)
metadata_df.head(15)
If we require that states or counties have 50% of the total expected deaths with race/ethnicity, we can use 50 states from CRDT, 33 states from CDC Restricted, and 51 states from NCHS State. If we tighten that requirement to 85% of total expected deaths with race/ethnicity, we can use 47 states from CRDT, 14 states from CDC Restricted, and 47 states from NCHS State.
We can look at the same scatterplots and maps at the county level for the NCHS County and CDC datasets.
#@title
cdc_nyt_counties_race_df['cases_max_100_percent'] = cdc_nyt_counties_race_df.percent.clip(upper=1)
cdc_nyt_counties_race_df['composite_percent'] = cdc_nyt_counties_race_df.cases_max_100_percent * cdc_nyt_counties_race_df.cdc_known_cases_percent
county_composite_map = CreateScatterPlotAndMap(
cdc_nyt_counties_race_df, composite_fields_dict, composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'county', 'percent'
)
county_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_counties_race_df, field='composite_percent')
#greater_than_85_df = cdc_nyt_counties_race_df[cdc_nyt_counties_race_df['composite_percent'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = cdc_nyt_counties_race_df[cdc_nyt_counties_race_df['composite_percent'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
#print('total counties: 3143')
#@title
nchs_nyt_counties_df['cases_max_100_percent'] = nchs_nyt_counties_df.percent.clip(upper=1)
nchs_nyt_counties_df['composite_percent'] = nchs_nyt_counties_df.cases_max_100_percent * nchs_nyt_counties_df.cdc_known_cases_percent
county_composite_map = CreateScatterPlotAndMap(
nchs_nyt_counties_df, nchs_composite_fields_dict, nchs_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'county', 'percent'
)
county_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(nchs_nyt_counties_df, field='composite_percent')
#greater_than_85_df = nchs_nyt_counties_df[nchs_nyt_counties_df['composite_percent'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = nchs_nyt_counties_df[nchs_nyt_counties_df['composite_percent'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
#print('total counties: 3143')
Notes:
#@title
# Manually update these fields based on the latest CDC data.
counties_row_names = [
'Number of counties with composite >= 50%',
'(as a percent of all counties)',
'Number of counties with composite >= 85%',
'(as a percent of all counties)',
'Population in counties with composite >= 50%',
'(as a % of total U.S population – States + D.C.)',
'Population in counties with composite >= 85%',
'(as a % of total U.S population – States + D.C.)',
]
counties_crdt_metadata = [
'-',
'-',
'-',
'-',
'-',
'-',
'-',
'-',
]
counties_cdc_metadata = [
'1,639',
'(52%)',
'800',
'(25%)',
'210M',
'(64%)',
'146M',
'(45%)',
]
counties_nchs_metadata = [
'737',
'(23%)',
'549',
'(17%)',
'253M',
'(77%)',
'208M',
'(63%)',
]
table_data = {
'CRDT': counties_crdt_metadata,
'CDC Restricted': counties_cdc_metadata,
'NCHS County': counties_nchs_metadata,
}
metadata_df = pd.DataFrame(table_data, index=counties_row_names)
metadata_df.head(15)
If we require that states or counties have 50% of the total expected deaths with race/ethnicity, we can use 1,639 counties from the CDC Restricted dataset that account for 64% of the population or 737 counties from the NCHS County dataset that account for 77% of the U.S. population. If we tighten that requirement to 85% of total expected deaths with race/ethnicity, we can use 800 counties from the CDC Restricted dataset that account for 45% of the U.S. population or 549 counties from the NCHS County dataset that account for 63% of the U.S. population.
The NCHS datasets are more complete than the CDC Restricted Access dataset in almost every way: 98%-99% of the deaths reported have race/ethnicity, they contain 87%-97% of death counts in the NYT, and they are updated once a week rather than once every two weeks.
However, there are a few tradeoffs with using the NCHS State dataset:
There are a few additional tradeoffs with using the NCHS County dataset:
We didn't show data for territories in the maps above due to limitations in the map visualizations. Here are the details on data completeness for the territories included in the CDC Restricted Access Dataset. Note that the dataset didn't include any cases in American Samoa.
#@title
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY, include_territories=True)
cdc_nyt_states_df = cdc_states_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
cdc_nyt_states_df.set_index('state', inplace=True)
cdc_states_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_QUERY, 'cdc_', include_territories=True)
cdc_states_race_df.set_index('state', inplace=True)
row_names = [
'CDC deaths',
'NYT deaths',
'(CDC as a % of NYT deaths)',
'Deaths with race/ethnicity',
'(% with race/ethnicity)',
'(composite % of NYT total with race/ethnicity)'
]
cdc_cases = [cdc_nyt_states_df.loc[territory].cdc_cases for territory in TERRITORIES[:-1]]
cdc_cases = [float(i) for i in cdc_cases]
nyt_cases = [cdc_nyt_states_df.loc[territory].nyt_cases for territory in TERRITORIES[:-1]]
nyt_cases = [float(i) for i in nyt_cases]
nyt_percent = [str(round(cdc_nyt_states_df.loc[territory].percent * 100)) + '%' for territory in TERRITORIES[:-1]]
race_ethnicity_cases = [cdc_states_race_df.loc[territory].cdc_known_cases for territory in TERRITORIES[:-1]]
race_ethnicity_percent = [str(round(cdc_states_race_df.loc[territory].cdc_known_cases_percent * 100)) + '%' for territory in TERRITORIES[:-1]]
composite_percent = [str(round(cdc_nyt_states_df.loc[territory].percent * cdc_states_race_df.loc[territory].cdc_known_cases_percent * 100)) + '%' for territory in TERRITORIES[:-1]]
territories_data = list(zip(cdc_cases,
nyt_cases,
nyt_percent,
race_ethnicity_cases,
race_ethnicity_percent,
composite_percent))
table_data = {'Puerto Rico': territories_data[0], 'Guam': territories_data[1], 'Virgin Islands': territories_data[2], 'Northern Mariana Islands': territories_data[3]}
territories_df = pd.DataFrame(table_data, index=row_names)
pd.options.display.float_format = '{:,.0f}'.format
territories_df.head(n=10)
Here is the full comparison between the CRDT, CDC Restricted, NCHS State, and NCHS County datasets. Parts of this table appear in the paper above; this consolidates them into all one place for a full overview.
#@title
table_data = {'CRDT': overall_crdt_metadata + (len(num_counties_cdc_metadata) - 3) * ['-'] + states_crdt_metadata + counties_crdt_metadata,
'CDC Restricted': overall_cdc_metadata + num_counties_cdc_metadata[3:] + states_cdc_metadata + counties_cdc_metadata,
'NCHS State': overall_nchs_state_metadata + num_counties_nchs_state_metadata[3:] + states_nchs_metadata + len(counties_nchs_metadata) * ['-'],
'NCHS County': overall_nchs_county_metadata + num_counties_nchs_county_metadata[3:] + len(states_nchs_metadata) * ['-'] + counties_nchs_metadata
}
metadata_df = pd.DataFrame(table_data, index=overall_row_names + num_counties_row_names[3:] + states_row_names + counties_row_names)
metadata_df.head(25)
To make it easier to hover over small counties, here are larger versions of the county maps that appeared in this report.
#@title
CreateMap(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'ratio'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
CreateMap(
cdc_nyt_counties_race_df, cdc_race_fields_dict, cdc_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
CreateMap(
nchs_nyt_counties_df, nchs_nyt_county_fields_dict, nchs_nyt_county_title, COUNTY_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'ratio'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
CreateMap(
nchs_nyt_counties_df, nchs_known_county_fields_dict, nchs_known_county_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
CreateMap(
cdc_nyt_counties_race_df, composite_fields_dict, composite_title, 1, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#@title
CreateMap(
nchs_nyt_counties_df, nchs_composite_fields_dict, nchs_composite_title, 1, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
The NCHS state dataset had separate rows for New York State and New York City; we combined them into New York State.
Please email us at shli-covid-data-analysis@googlegroups.com with questions or comments.
#@title
#%%shell
#jupyter nbconvert --to html 'cdc_death_data.ipynb'