May 24, 2021
Katrina Sostek
Work done as part of a Google.org Fellowship with the Satcher Health Leadership Institute at the Morehouse School of Medicine
#@title
# How to update the data:
# 1. You may need to copy this colab so you have your own version.
# 2. Update the history constants below to have the latest data's suffix.
# 3. Update the date history constants below to be the last case date included in the data.
# 4. Updates the scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
# Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full updated of
# the paper including text and tables, a lot of that is done in commented out PrintSummaryStats() statements.
import pandas as pd
import altair as alt
from vega_datasets import data
from google.colab import auth
auth.authenticate_user()
# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
pd.options.display.float_format = '{:,.2f}'.format
# Table name suffixes.
DATE_HISTORY = [
'20200529',
'20200627',
'20200731',
'20200831',
'20200930',
'20201031',
'20201204',
'20201231',
'20210131',
'20210228',
'20210331',
'20210430',
# '20210524',
# '20210621',
# '20210719',
# '20210831',
# '20211012',
# '20211025',
# '20211130',
# '20211221',
# '20220124',
]
# Cases up to these dates for querying NYT case counts.
DATE_UP_TO_HISTORY = [
'DATE(2020, 05, 14)',
'DATE(2020, 06, 12)',
'DATE(2020, 07, 16)',
'DATE(2020, 08, 17)',
'DATE(2020, 09, 15)',
'DATE(2020, 10, 16)',
'DATE(2020, 11, 19)',
'DATE(2020, 12, 16)',
'DATE(2021, 01, 16)',
'DATE(2021, 02, 13)',
'DATE(2021, 03, 16)',
'DATE(2021, 04, 15)',
# 'DATE(2021, 05, 07)',
# 'DATE(2021, 06, 04)',
# 'DATE(2021, 07, 02)',
# 'DATE(2021, 08, 15)',
# 'DATE(2021, 09, 24)',
# 'DATE(2021, 10, 10)',
# 'DATE(2021, 11, 15)',
# 'DATE(2021, 12, 05)',
# 'DATE(2022, 01, 09)',
]
# Cases up to these dates for charts.
DATE_UP_TO_DISPLAY_NAME_HISTORY = [
'May 14',
'Jun 12',
'Jul 16',
'Aug 17',
'Sep 15',
'Oct 16',
'Nov 19',
'Dec 16',
'Jan 16',
'Feb 13',
'Mar 16',
'Apr 15',
# 'May 7',
# 'Jun 4',
# 'Jul 2',
# 'Aug 15',
# 'Sep 24',
# 'Oct 10',
# 'Nov 15',
# 'Dec 5',
# 'Jan 9',
]
# Dates for Altair line charts; the 1st of the month formatted to the previous day.
DATE_YEAR_HISTORY = [
'2020-05-28',
'2020-06-28',
'2020-07-28',
'2020-08-28',
'2020-09-28',
'2020-10-28',
'2020-11-28',
'2020-12-28',
'2021-01-28',
'2021-02-28',
'2021-03-28',
'2021-04-28',
# '2021-05-28',
# '2021-06-28',
# '2021-07-28',
# '2021-08-28',
# '2021-09-28',
# '2021-10-28',
# '2021-11-28',
# '2021-12-28',
# '2022-01-28',
]
# Dates for Altair bar charts, which were less particular than the line charts.
DATE_YEAR_DISPLAY_NAME_HISTORY = [
'May 2020',
'Jun 2020',
'Jul 2020',
'Aug 2020',
'Sep 2020',
'Oct 2020',
'Nov 2020',
'Dec 2020',
'Jan 2021',
'Feb 2021',
'Mar 2021',
'Apr 2021',
# 'May 2021',
# 'Jun 2021',
# 'Jul 2021',
# 'Aug 2021',
# 'Sep 2021',
# 'Oct 2021',
# 'Nov 2021',
# 'Dec 2021',
# 'Jan 2022',
]
# Project and table names.
PROJECT_ID = 'msm-secure-data-1b'
def FullTableName(date):
return '`%s.ndunlap_secure.cdc_restricted_access_%s`' % (PROJECT_ID, date)
CDC_TABLE_HISTORY = [FullTableName(date) for date in DATE_HISTORY]
CDC_TABLE = CDC_TABLE_HISTORY[-1]
CDC_PUBLIC_GEO_TABLE = '`%s.ndunlap_secure.cdc_public_use_with_geo_%s`' % (PROJECT_ID, DATE_HISTORY[-1])
CRDT_TABLE = '`%s.ndunlap_secure.crdt_20210307`' % PROJECT_ID
CREW_TABLE = '`msm-internal-data.crew.covid_case_surveillance`'
# Dates in different formats.
DATE = DATE_UP_TO_HISTORY[-1]
DATE_DISPLAY_NAME = DATE_UP_TO_DISPLAY_NAME_HISTORY[-1]
CRDT_DATE = '20210307'
CRDT_COMPARE_DATE = 'DATE(2021, 03, 07)'
CRDT_COMPARE_DATE_DISPLAY_NAME = 'Mar 7'
#TOTAL_CRDT_CASES = 28756184 # from https://covidtracking.com/data/national
# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
TOTAL_CASES_SCALE_MAX = 4000000
COUNTY_CASES_SCALE_MAX = 1400000
COUNTY_CASES_ZOOM_SCALE_MAX = 200000
CASES_RACE_SCALE_MAX = 3000000 # known race/ethnicity
COUNTY_CASES_RACE_SCALE_MAX = 900000 # known race/ethnicity
# Chart settings.
SCATTER_HEIGHT = 300
SCATTER_WIDTH = 300
MAP_HEIGHT = 300
MAP_WIDTH = 450
US_STATES_TOPO = alt.topo_feature(data.us_10m.url, 'states')
US_COUNTIES_TOPO = alt.topo_feature(data.us_10m.url+"#", 'counties')
TERRITORIES = ('PR', 'GU', 'VI', 'MP', 'AS')
NYT_TERRITORIES = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
STATES_TO_FIPS = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12,
'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23,
'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34,
'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45,
'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56,
'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78, 'USVI': 78, 'NYC': 36}
FIPS_TO_STATES = {STATES_TO_FIPS[key]: key for key in STATES_TO_FIPS}
RACE_ETHNICITY_COMBINED_MAP = {
'Asian, Non-Hispanic': 'asian_cases',
'Black, Non-Hispanic': 'black_cases',
'White, Non-Hispanic': 'white_cases',
'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
'Hispanic/Latino': 'hispanic_cases',
'Multiple/Other, Non-Hispanic': 'other_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
'Missing': 'unknown_cases',
'Unknown': 'unknown_cases',
'NA': 'na_cases',
}
RACE_ETHNICITY_COMBINED_KNOWN_MAP = {
'Asian, Non-Hispanic': 'cdc_known_cases',
'Black, Non-Hispanic': 'cdc_known_cases',
'White, Non-Hispanic': 'cdc_known_cases',
'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
'Hispanic/Latino': 'cdc_known_cases',
'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
'Missing': 'cdc_unknown_cases',
'Unknown': 'cdc_unknown_cases',
'NA': 'cdc_na_cases',
}
#@title
CDC_OVERALL_RACE_QUERY = ('''
SELECT
race_ethnicity_combined,
COUNT(*) as cases
FROM
%s
GROUP BY
1
''' % CDC_TABLE)
NYT_US_QUERY_STR = ('''
SELECT
sum(confirmed_cases) as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
date = %s
''')
CRDT_QUERY = ('''
SELECT
State as state,
Cases_Total as crdt_cases,
Cases_Total - Cases_Unknown as crdt_known_race_cases,
ROUND(1 - Cases_Unknown / Cases_Total, 4) as crdt_known_race_cases_percent,
Cases_Total - Cases_Ethnicity_Unknown as crdt_known_ethnicity_cases,
ROUND(1 - Cases_Ethnicity_Unknown / Cases_Total, 4) as crdt_known_ethnicity_cases_percent,
FROM %s
WHERE
date = %s
''' % (CRDT_TABLE, CRDT_DATE))
NYT_STATES_QUERY_STR = ('''
SELECT
state_name,
state_fips_code,
confirmed_cases as nyt_cases,
deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
date = %s AND
state_fips_code IS NOT NULL
''')
NYT_STATES_QUERY = NYT_STATES_QUERY_STR % DATE
NYT_STATES_COMPARE_CRDT_QUERY = NYT_STATES_QUERY_STR % CRDT_COMPARE_DATE
CDC_STATES_QUERY_STR = ('''
SELECT
res_state,
COUNT(*) as cdc_cases
FROM
%s
GROUP BY
res_state
''')
CDC_STATES_QUERY = CDC_STATES_QUERY_STR % CDC_TABLE
NYT_COUNTIES_QUERY = ('''
SELECT
county_fips_code,
confirmed_cases as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
date = %s AND
county_fips_code IS NOT NULL
''' % DATE)
CDC_COUNTIES_RACE_QUERY = ('''
SELECT
res_state,
res_county,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
GROUP BY
res_county,
res_state,
race_ethnicity_combined
''' % CDC_TABLE)
COUNTY_FIPS_MAPPING_QUERY = ('''
SELECT
*
FROM
`msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''')
ACS_POPULATION_DATA_QUERY = ('''
SELECT
state,
county,
county_fips,
total_pop
FROM
`msm-internal-data.ipums_acs.acs_2019_5year_county`
''')
CDC_STATES_RACE_QUERY_STR = ('''
SELECT
res_state,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
GROUP BY
res_state,
race_ethnicity_combined
''')
CDC_STATES_RACE_QUERY = CDC_STATES_RACE_QUERY_STR % CDC_TABLE
CDC_STATES_RACE_UP_TO_CRDT_QUERY = ('''
SELECT
res_state,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
WHERE
cdc_case_earliest_dt <= %s
GROUP BY
res_state,
race_ethnicity_combined
''' % (CDC_TABLE, CRDT_COMPARE_DATE))
CDC_PUBLIC_GEO_STATES_QUERY = ('''
SELECT
res_state,
COUNT(*) as cdc_public_geo_cases
FROM
%s
GROUP BY
res_state
''' % CDC_PUBLIC_GEO_TABLE)
RACE_ETHNICITY_COMBINED_STR = '''
CASE
WHEN ethnicity = "Non-Hispanic/Latino" AND race != "Missing" THEN race
WHEN ethnicity = "Hispanic/Latino" THEN ethnicity
WHEN ethnicity = "NA" THEN "NA"
WHEN ethnicity = "Missing" AND race = "Missing" THEN "Missing"
ELSE "Unknown"
END
'''
CDC_PUBLIC_GEO_COUNTIES_RACE_QUERY = ('''
SELECT
res_state,
res_county,
%s as race_ethnicity_combined,
COUNT(*) as cdc_public_geo_cases
FROM
%s
GROUP BY
res_county,
res_state,
race_ethnicity_combined
''' % (RACE_ETHNICITY_COMBINED_STR, CDC_PUBLIC_GEO_TABLE))
CDC_PUBLIC_GEO_STATES_RACE_QUERY = ('''
SELECT
res_state,
%s as race_ethnicity_combined,
COUNT(*) as cdc_public_geo_cases
FROM
%s
GROUP BY
res_state,
race_ethnicity_combined
''' % (RACE_ETHNICITY_COMBINED_STR, CDC_PUBLIC_GEO_TABLE))
#@title
# This either works for a list of fields and one table or a list of tables and one field.
def FieldAnalysis(project_id, table_list, field_list, title, calculate_race_ethnicity=False):
field_dict = {}
if len(field_list) > 1:
list_for_iteration = field_list
table = table_list[0]
else:
list_for_iteration = table_list
field = field_list[0]
for item in list_for_iteration:
field_dict[item] = [0.0, 0.0, 0.0, 0.0]
unknowns = pd.DataFrame(field_dict, index=['Unknown', 'Missing', 'NA', 'Known'])
field_known_history = []
total_count_history = []
field_series = []
value_series = []
percent_series = []
cases_series = []
chart_denominator = 1000000
field_display_name = {
'cdc_case_earliest_dt': 'CDC earliest case date',
'current_status': 'Case status',
'case_month': 'Case month',
'res_state': 'State',
'res_county': 'County',
'sex': 'Sex',
'age_group': 'Age',
'race': 'Race',
'ethnicity': 'Ethnicity',
'race_ethnicity_combined': 'Race/Ethnicity',
}
for i, item in enumerate(list_for_iteration):
if len(field_list) > 1:
field = item
else:
table = item
field_unknowns_query = ('''
SELECT
%s,
count(*) as cases
FROM
%s
GROUP BY
%s
''')
if (field == 'race_ethnicity_combined' and (
calculate_race_ethnicity or item == CDC_TABLE_HISTORY[0])):
field_unknowns_query = ('''
SELECT ''' + RACE_ETHNICITY_COMBINED_STR + '''
as %s,
count(*) as cases
FROM
%s
GROUP BY
%s
''')
query = field_unknowns_query % (field, table, field)
field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
field_unknowns_df.set_index(field, inplace=True)
field_unknowns_df.index = field_unknowns_df.index.fillna('Null')
missing_count = 0
if 'Missing' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['Missing'].cases
if 'Null' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['Null'].cases
if '' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc[''].cases
if 'OTH' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['OTH'].cases
if 'nul' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['nul'].cases
unknowns.loc['Missing', field] = missing_count
unknown_count = 0
if 'Unknown' in field_unknowns_df.index:
unknown_count += field_unknowns_df.loc['Unknown'].cases
unknowns.loc['Unknown', field] = unknown_count
na_count = 0
if 'NA' in field_unknowns_df.index:
na_count += field_unknowns_df.loc['NA'].cases
unknowns.loc['NA', field] = na_count
unknowns.loc['Known', field] = field_unknowns_df.cases.sum() - (
unknowns.loc['Missing', field] +
unknowns.loc['Unknown', field] +
unknowns.loc['NA', field])
if len(field_list) > 1:
field_series.extend([field_display_name.get(field, field)] * 4)
else:
field_series.extend([DATE_YEAR_DISPLAY_NAME_HISTORY[i]] * 4)
field_known_history.append(unknowns.loc['Known', field] / field_unknowns_df.cases.sum())
total_count_history.append(field_unknowns_df.cases.sum())
value_series.extend(['Known', 'Suppressed', 'Unknown', 'Missing'])
percent_series.extend([unknowns.loc['Known', field] / field_unknowns_df.cases.sum(),
unknowns.loc['NA', field] / field_unknowns_df.cases.sum(),
unknowns.loc['Unknown', field] / field_unknowns_df.cases.sum(),
unknowns.loc['Missing', field] / field_unknowns_df.cases.sum()])
cases_series.extend([unknowns.loc['Known', field] / chart_denominator,
unknowns.loc['NA', field] / chart_denominator,
unknowns.loc['Unknown', field] / chart_denominator,
unknowns.loc['Missing', field] / chart_denominator])
bars_df = pd.DataFrame.from_dict({'field': field_series,
'value': value_series,
'percent': percent_series,
'cases': cases_series})
chart = alt.Chart(bars_df).mark_bar().encode(
x=alt.X('percent:Q', axis=alt.Axis(format='%'), title=''),
y=alt.Y('field:N', title='Field', sort=field_list),
color=alt.Color('value:N', scale=alt.Scale(scheme='category20'), title='Value'),
order=alt.Order('field:N'),
tooltip=[
alt.Tooltip('field:N', title='Field'),
alt.Tooltip('value:N', title='Value'),
alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
]
).properties(title=title)
return total_count_history, field_known_history, chart
def CreateNYTStateDataframe(query, include_territories=False):
nyt_states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
if not include_territories:
for territory in NYT_TERRITORIES:
nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
nyt_states_df.set_index('state_fips_code', inplace=True)
return nyt_states_df
def CreateCDCStateDataframe(query):
states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
states_df = states_df[states_df.res_state != state]
states_df.rename(columns={'res_state': 'state'}, inplace=True)
states_df['state_fips_code'] = states_df.state
states_df = states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
states_df['state_fips_code'] = states_df.state_fips_code.astype(int)
states_df.set_index('state_fips_code', inplace=True)
return states_df
def CreateCDCStateRaceDataframe(query, cases_field_prefix, include_territories=False):
states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
states_df = states_df[states_df.res_state != state]
states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
states_df = states_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_KNOWN_MAP})
states_df.rename(columns={'res_state': 'state'}, inplace=True)
cases_field = cases_field_prefix + 'cases'
crosstab_df = pd.crosstab(states_df['state'],
states_df.race_ethnicity_combined,
values=states_df[cases_field],
aggfunc=sum,
margins=True,
margins_name=cases_field
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df[cases_field_prefix + 'known_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df[cases_field_prefix + 'known_or_na_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df[cases_field_prefix + 'known_cases_percent'] = round(
crosstab_df[cases_field_prefix + 'known_cases'] /
crosstab_df[cases_field], 4)
crosstab_df[cases_field_prefix + 'known_or_na_cases_percent'] = round(
crosstab_df[cases_field_prefix + 'known_or_na_cases'] /
crosstab_df[cases_field], 4)
crosstab_df['state_fips_code'] = crosstab_df.state
crosstab_df = crosstab_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
# Remove territories and missing states for calculating summary stats.
if not include_territories:
for territory in TERRITORIES:
crosstab_df = crosstab_df[crosstab_df.state != territory]
crosstab_df = crosstab_df[crosstab_df.state != 'NA']
crosstab_df = crosstab_df[crosstab_df.state != 'Missing']
crosstab_df = crosstab_df[crosstab_df.state != 'Unknown']
crosstab_df.set_index('state_fips_code', inplace=True)
return crosstab_df
def CreateCRDTStateRaceDataframe(query):
crdt_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for territory in TERRITORIES:
crdt_df = crdt_df[crdt_df.state != territory]
crdt_df['state_fips_code'] = crdt_df.state
crdt_df = crdt_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
crdt_df['state_fips_code'] = crdt_df.state_fips_code.astype(int)
crdt_df.set_index('state_fips_code', inplace=True)
return crdt_df
def CreateNYTCountyDataframe(query):
nyt_counties_df = pd.io.gbq.read_gbq(NYT_COUNTIES_QUERY, project_id=PROJECT_ID)
nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_df.county_fips.unique()
nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
nyt_counties_df.set_index('county_fips', inplace=True)
return nyt_counties_df
def CreateCDCCountyRaceDataframe(query, cases_field_prefix):
cdc_counties_race_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
for territory in TERRITORIES:
cdc_counties_race_df = cdc_counties_race_df[cdc_counties_race_df.res_state != territory]
county_fips_map_df = pd.io.gbq.read_gbq(COUNTY_FIPS_MAPPING_QUERY, project_id=PROJECT_ID)
county_fips_map_df.cdc_county = county_fips_map_df.cdc_county.str.lower()
county_fips_map_df['state_county'] = county_fips_map_df.state + '-' + county_fips_map_df.cdc_county
county_fips_map_df['state_county'] = county_fips_map_df.state_county.astype('string').str.strip()
county_fips_map_df.set_index('state_county', inplace=True)
# Concatenate the state and county names because county names are not unique across states.
cdc_counties_race_df.res_county = cdc_counties_race_df.res_county.str.lower()
cdc_counties_race_df['state_county'] = cdc_counties_race_df.res_state + '-' + cdc_counties_race_df.res_county
cdc_counties_race_df['state_county'] = cdc_counties_race_df.state_county.astype('string').str.strip()
cdc_counties_race_df.set_index('state_county', inplace=True)
cdc_counties_race_df['race_ethnicity_combined'] = cdc_counties_race_df.race_ethnicity_combined.astype('string').str.strip()
cdc_counties_race_df = cdc_counties_race_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})
# Printed value used in the footnotes below.
# All other checks for county_fips_code mappings are now in
# https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006
mismatches_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'missing']
# print(mismatches_df.cases.sum())
cdc_counties_race_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')
cases_field = cases_field_prefix + 'cases'
# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
cdc_counties_race_crosstab_df = pd.crosstab(cdc_counties_race_df['county_fips'],
cdc_counties_race_df.race_ethnicity_combined,
values=cdc_counties_race_df[cases_field],
aggfunc=sum,
margins=True,
margins_name=cases_field
)
# Have to reset_index() to go from pandas multi-index to single index.
cdc_counties_race_crosstab_df = cdc_counties_race_crosstab_df.reset_index()
cdc_counties_race_crosstab_df.drop(axis=0, index=len(cdc_counties_race_crosstab_df) - 1, inplace=True)
cdc_counties_race_crosstab_df['county_fips'] = cdc_counties_race_crosstab_df.county_fips.astype(int)
cdc_counties_race_crosstab_df[cases_field_prefix + 'known_cases'] = (
cdc_counties_race_crosstab_df[cases_field] -
cdc_counties_race_crosstab_df.na_cases.fillna(0) -
cdc_counties_race_crosstab_df.unknown_cases.fillna(0))
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
acs_name_lookup_df = pd.io.gbq.read_gbq(ACS_POPULATION_DATA_QUERY, project_id=PROJECT_ID)
acs_name_lookup_df['state_county'] = (acs_name_lookup_df.county.astype('string').str.strip() +
', ' + acs_name_lookup_df.state.astype('string').str.strip())
acs_name_lookup_df.drop(columns=['state', 'county'], inplace=True)
acs_name_lookup_df.set_index('county_fips', inplace=True)
cdc_counties_race_df = cdc_counties_race_crosstab_df.join(acs_name_lookup_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
cdc_counties_race_df.county_fips = cdc_counties_race_df.county_fips.astype(int)
cdc_counties_race_df.set_index('county_fips', inplace=True)
return cdc_counties_race_df
def CreateScatterPlot(
chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
geo_field = 'state'
geo_field_display_name = 'State'
if geo == 'county':
geo_field = 'state_county'
geo_field_display_name = 'County'
if metric_type == 'ratio':
scale_scheme = 'blueorange'
scale_reverse = True
scale_domain = [0, 2]
legend_format = '.1f'
axis_format = ',.0f'
elif metric_type == 'percent':
scale_scheme = 'redyellowblue'
scale_reverse = False
scale_domain = [0, 1]
legend_format = '.0%'
axis_format = '.0%'
tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
for field in ('y', 'x', 'percent'):
tooltips.append(alt.Tooltip(
fields_dict[field]['name'] + ':Q',
format=fields_dict[field]['format'],
title=fields_dict[field]['title'],
))
plot = alt.Chart(chart_df).mark_circle(size=60).encode(
alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
scale=alt.Scale(domain=(0, scale_max))
),
alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
scale=alt.Scale(domain=(0, scale_max))
),
color=alt.Color(fields_dict['percent']['name'],
type='quantitative',
scale=alt.Scale(scheme=scale_scheme,
reverse=scale_reverse,
domain=scale_domain,
clamp=True),
legend=alt.Legend(format=legend_format),
title=metric_type.capitalize()),
tooltip=tooltips,
).properties(
height=height,
width=width,
)
if metric_type == 'ratio':
plot.interactive()
line = pd.DataFrame({
'x': [0, scale_max],
'y': [0, scale_max],
})
if metric_type == 'ratio':
line_plot = alt.Chart(line).mark_line(color='black').encode(
x='x',
y='y',
)
elif metric_type == 'percent':
line_plot = (
alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
)
# Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
scatter = (plot + line_plot).properties(
title=title,
height=height,
width=width,
).interactive()
return scatter
def CreateMap(
chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
geo_field = 'state'
geo_field_display_name = 'State'
fips_code = 'state_fips_code'
topo_feature = US_STATES_TOPO
if geo == 'county':
geo_field = 'state_county'
geo_field_display_name = 'County'
fips_code = 'county_fips'
topo_feature = US_COUNTIES_TOPO
if metric_type == 'ratio':
scale_scheme = 'blueorange'
scale_reverse = True
scale_domain = [0, 2]
legend_format = '.1f'
elif metric_type == 'percent':
scale_scheme = 'redyellowblue'
scale_reverse = False
scale_domain = [0, 1]
legend_format = '.0%'
highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
for field in ('y', 'x', 'percent'):
tooltips.append(alt.Tooltip(
fields_dict[field]['name'] + ':Q',
format=fields_dict[field]['format'],
title=fields_dict[field]['title'],
))
field_names = [geo_field]
field_names.extend([fields_dict[field]['name'] for field in fields_dict])
plot = alt.Chart(topo_feature).mark_geoshape(
stroke='white',
strokeOpacity=.2,
strokeWidth=1
).project(
type='albersUsa'
).transform_lookup(
lookup='id',
from_=alt.LookupData(chart_df, fips_code, field_names)
).encode(
alt.Color(fields_dict['percent']['name'],
type='quantitative',
legend=alt.Legend(format=legend_format),
scale=alt.Scale(scheme=scale_scheme,
reverse=scale_reverse,
domain=scale_domain,
clamp=True,
),
title=metric_type.capitalize()),
tooltip=tooltips
).add_selection(
highlight,
)
states_outline = alt.Chart(US_STATES_TOPO).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
type='albersUsa'
)
states_fill = alt.Chart(US_STATES_TOPO).mark_geoshape(
fill='silver',
stroke='white'
).project('albersUsa')
layered_map = alt.layer(states_fill, plot, states_outline).properties(
height=height,
width=width,
title=title,
)
return layered_map
def CreateScatterPlotAndMap(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
scatter = CreateScatterPlot(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
map = CreateMap(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
return (scatter | map).configure_view(
strokeWidth=0,
).configure_mark(
stroke='grey'
).configure_legend(
gradientLength=scatter_height - 50
)
def CreateLineChart(title, dates, race_ethnicity_known_history, overall_nyt_percent, overall_composite):
line_chart_dict = {}
line_chart_dict['date'] = dates * 3
line_chart_dict['metric'] = (['Percent with Race/Ethnicity'] * len(dates) +
['Percent of NYT total case counts'] * len(dates) +
['Composite percent'] * len(dates)
)
line_chart_dict['value'] = (race_ethnicity_known_history +
overall_nyt_percent +
overall_composite
)
line_chart_df = pd.DataFrame(line_chart_dict)
line = alt.Chart(line_chart_df).mark_line(point=True).encode(
x=alt.X('yearmonth(date):O', title='', axis=alt.Axis(labelAngle=0)),
y=alt.Y('value:Q', title='', axis=alt.Axis(format='%')),
color=alt.Color('metric', title='', scale=alt.Scale(scheme='browns'),
sort=['Percent of NYT total case counts',
'Percent with Race/Ethnicity',
'Composite percent',
]),
)
nearest = alt.selection(type='single', nearest=True, on='mouseover',
fields=['date'], empty='none')
# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart(line_chart_df).mark_point().encode(
x='yearmonth(date):O',
opacity=alt.value(0),
).add_selection(
nearest
)
# Draw points on the line, and highlight based on selection
points = line.mark_point().encode(
opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)
# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align='left', dx=7, dy=-7).encode(
text=alt.condition(nearest, 'value:Q', alt.value(' '), format='.0%')
)
# Put the five layers into a chart and bind the data
line_chart = alt.layer(
line, selectors, points, text
).properties(
title=title,
width=550,
height=300
)
return line_chart
def PrintSummaryStats(chart_df, field='percent'):
plus_minus_15_df = chart_df[chart_df[field] >= .85]
plus_minus_15_df = plus_minus_15_df[plus_minus_15_df[field] <= 1.15]
print('between +/-15%: ', len(plus_minus_15_df), round(len(plus_minus_15_df) / len(chart_df), 2))
plus_minus_50_df = chart_df[chart_df[field] >= .50]
plus_minus_50_df = plus_minus_50_df[plus_minus_50_df[field] <= 1.50]
print('between +/-50%: ', len(plus_minus_50_df), round(len(plus_minus_50_df) / len(chart_df), 2))
print('< than .50: ', len(chart_df[chart_df[field] < .5]))
print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
print(chart_df[field].describe())
The Covid Tracking Project was the most reliable source for COVID-19 data with race/ethnicity at the state level until it stopped collecting data on March 7, 2021. The CDC's Case Surveillance Restricted Access and Public Use with Geography datasets are the only available replacements for the Covid Tracking Project's dataset, and they additionally include county-level data and age along with race/ethnicity. This paper evaluates the completeness of the CDC datasets at the state and county levels in terms of (1) the total number of cases included compared to the New York Times, and (2) the number of cases included with race/ethnicity data compared to the Covid Tracking Project.
The CDC's Restricted Access dataset contains 78% of the cases in the New York Times up to April 15, 2021, and 65% of cases have race/ethnicity information vs. 67% in the Covid Tracking Project. The dataset's completeness has steadily and gradually improved over time; e.g., the first available version from May 2020 had race/ethnicity information for only 43% of cases. At the state and county levels, the dataset's completeness has also improved with a state-level average of 62% of cases with race/ethnicity in April 2021 vs. 46% in June 2020. However, the dataset's completeness at the state level is highly variable; for example, Minnesota has 102% of the cases included in the New York Times, while Louisiana has only 4% of the cases in the New York Times. Minnesota has 91% of cases with race/ethnicity, while Louisiana has only 19% with race/ethnicity (vs. 94% in the Covid Tracking Project). Texas alone is missing 2.8M cases, accounting for more than a third of the total 7.1M missing cases. New York is missing race/ethnicity for 1.3M cases and California for 1.1M cases, accounting for more than a quarter of the 8.6M cases missing race/ethnicity when combined.
The CDC's Public Use with Geography dataset is similar to the Restricted Access dataset for total case counts, but is less complete due to more privacy suppression; e.g., only 49% of cases have race/ethnicity information.
The racial and ethnic disparities in the COVID-19 pandemic have exposed longstanding health inequities in the U.S., which have been described in multiple analyses of COVID-19 data by the Covid Tracking Project, New York Times, American Public Media Research Lab, and Kaiser Family Foundation among many others. However, we still don't have a full understanding of these disparities because race/ethnicity data continue to be incomplete more than a year after the pandemic started. As the Covid Tracking Project wrote on January 29, "the continued lack of either complete federal demographic data or federal guidelines for what states should publish make it impossible to fully understand who is being infected with and dying of COVID-19."
The most reliable and up-to-date data are scattered across state and local public health websites that use different standards and categories for reporting race/ethnicity. In 2020, collecting these data and turning them into a unified dataset was largely left to non-governmental organizations like the Covid Tracking Project, which stopped collecting data on March 7, 2021, a full year after it started. Even the federal government looked to the Covid Tracking Project for reliable COVID-19 race/ethnicity data. The office of the Assistant Secretary for Planning and Evaluation, an agency within the U.S. Department of Health and Human Services, wrote in October 2020, "The volunteer-based COVID tracking project has created the most comprehensive centralized resource for race and ethnicity data at the state level."
At the county level, finding race/ethnicity data for cases was nearly impossible for most of 2020. Since November 2020, the CDC has shown total case counts at the county level in a dashboard. Before the CDC published that data, several non-governmental organizations (New York Times, Johns Hopkins University, USAFacts) gathered data for total case counts at the county level. But none of these sources collected or published race/ethnicity data, which would have been a huge undertaking due to the non-standard reporting of race/ethnicity across state and local public health websites. The only public analysis of case data with race/ethnicity at the county level was in July 2020 when the New York Times published The Fullest Look Yet at the Racial Inequity of Coronavirus. The New York Times used CDC Case Surveillance data that they obtained via FOIA and legal action to analyze cases up to May 28, 2020.
Now that the Covid Tracking Project has stopped collecting data, there is only one option for COVID-19 case data with race/ethnicity unified across U.S. states and counties: the CDC's Case Surveillance data, which are based on state and local health departments reporting cases to the CDC. There are more options for data on deaths, which we discuss in a separate deaths data report.
The CDC publishes several variations of the Case Surveillance data:
These three datasets are from the same underlying source; they have the same number of records and are released every two weeks (monthly until May 2021). The Public Use dataset was first published in May 2020. The Restricted Access dataset was first published at the link above in November 2020; the dataset itself is in a private GitHub repository that the CDC shares with people who apply for access and adhere to a data use agreement. The CDC's initial data agreement did not allow for county-level analyses to be made public, but an updated data agreement from December 14, 2020 allowed such analyses. In January 2021, the Morehouse School of Medicine's Satcher Health Leadership Institute (MSM/SHLI) in collaboration with Google.org applied for and got access to this data within a few days. Several months later in March 2021, the CDC published the Public Use with Geography dataset with state and county information, fewer fields, and more data suppression than the Restricted Access dataset. In addition to these datasets, the CDC also publishes charts showing race/ethnicity breakdowns at the U.S. level in their COVID Data Tracker; see the Appendix for a comparison of that tracker to these datasets.
The CDC's Case Surveillance Restricted Access dataset has enormous potential: It could allow us to analyze data across all states and counties to study the disparities in COVID-19 cases using consistent race/ethnicity categories. This dataset could pick up where the Covid Tracking Project's dataset left off and additionally enable the first analyses of race/ethnicity disparities at the county level since July 2020. The dataset also has age and sex for each case, so we could analyze the intersection of race/ethnicity with age and sex. The dataset has case report dates, which would allow us to look at cases over time. However, the dataset will only live up to its potential if it is complete both in terms of the number of cases included and the number of cases that have race/ethnicity.
Our analysis shows that the CDC's Case Surveillance Restricted Access dataset has two main data completeness issues:
For the 15.8M cases where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:
#@title
overall_df = pd.io.gbq.read_gbq(CDC_OVERALL_RACE_QUERY, project_id=PROJECT_ID)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})
overall_df = overall_df.set_index('race_ethnicity_combined')
chart_denominator = 1000000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
overall_df.cases['black_cases'] / chart_denominator,
overall_df.cases['white_cases'] / chart_denominator,
overall_df.cases['asian_cases'] / chart_denominator,
overall_df.cases['nhpi_cases'] / chart_denominator,
overall_df.cases['aian_cases'] / chart_denominator,
overall_df.cases.sum() / chart_denominator,
]
# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
60481746 / chart_denominator,
40596040 / chart_denominator,
196789401 / chart_denominator,
18427914 / chart_denominator,
565473 / chart_denominator,
2236348 / chart_denominator,
328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
'Hispanic/Latino',
'Black',
'White',
'Asian',
'Native Hawaiian/Pacific Islander',
'American Indian/Alaska Native',
'*Total Including Unknowns*',
], 'percent': percent_list,
'cases': cases_list,
'population': pop_list,
})
bars = alt.Chart(prevalence).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='.1%'), title=''),
y=alt.Y('group', sort='-x', title=''),
color=alt.Color('group',
scale=alt.Scale(scheme='tableau20'),
title='',
legend=None),
tooltip=[
alt.Tooltip('group:N', title='Race/Ethnicity Group'),
alt.Tooltip('percent:Q', format='.2%', title='Prevalence within group'),
alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
alt.Tooltip('population:Q', format=',.2f', title='Population of group (millions)'),
]
).properties(
title='Percent of Race/Ethnicity Group who had COVID-19 based on Incomplete CDC Data up to %s' % DATE_DISPLAY_NAME
)
alt.concat(bars).properties(
title=alt.TitleParams(
['Population data source: The U.S. Census Bureau\'s American Community Survey 2019 5-year estimates.'],
baseline='bottom',
dy=20,
orient='bottom',
fontWeight='normal',
fontSize=11
)
).display()
The chart above is based on incomplete data. With only 78% of cases included, the total percent of people who had COVID-19 should be 9.6% instead of 7.4%. It's harder to estimate how much the race/ethnicity groups are undercounting the number of confirmed COVID-19 cases. If we added all 8.6M cases with missing race/ethnicity to the Hispanic/Latino group, the percent of Hispanic/Latinos in the U.S. who had COVID-19 would go from 7.4% to 21.7% — a 3x increase. If all 8.6M cases with missing race/ethnicity were Black people, the percent of Black people who had COVID-19 would go from 4.3% to 25.5% — a 6x increase. While these extreme scenarios are unlikely, they show us why missing race/ethnicity data is preventing us from truly understanding and addressing the disparities in the COVID-19 pandemic in the U.S.
At the same time, the data from state public health websites are not perfect; the Covid Tracking Project only has race/ethnicity data for 67% of cases up to March 7. We'll compare data completeness at the state level in the Case Surveillance data vs. the New York Times and the Covid Tracking Project to better understand how the data completeness compares between those datasets.
The goal of this analysis is to assess the completeness of the CDC's Restricted Access and Public Use with Geography datasets, which will help evaluate their usefulness in examining disparities in race/ethnicity for COVID-19 cases at the state and county levels. We'll first focus on the Restricted Access dataset, which is more complete than the Public Use with Geography dataset due to privacy reasons.
The overall data completeness findings for the CDC's Restricted Access Dataset are:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
_, _, chart = FieldAnalysis(PROJECT_ID, [CDC_TABLE], field_list, 'Most Complete Fields in the CDC Restricted Access Dataset')
chart.display()
We can evaluate the overall completeness of the CDC datasets by calculating at (1) the percent of total case counts compared to the NYT, (2) the percent of cases with race/ethnicity, and then (3) combine those two percentages into a composite that represents the percentage of total expected cases that have race/ethnicity. Later on, we will do this same analysis at the state and county levels.
#@title
# Manually update these fields based on chart above, latest CDC data,
# and improving state/county data below.
overall_row_names = [
'Update frequency',
'Latest case date',
'Cases in dataset as of date',
'Cases in NYT as of date',
'(as a % of NYT)',
'Cases with race/ethnicity',
'(as a % of cases in dataset)',
'(composite % of NYT total with race/ethnicity)',
]
overall_crdt_metadata = [
'Stopped',
'Mar 7, 2021',
'28.8M',
'29.0M',
'(99%)',
'19.2M',
'(67%)',
'(66%)',
]
overall_cdc_metadata = [
'Every two weeks',
'April 15, 2021',
'24.4M',
'31.5M',
'(78%)',
'15.8M',
'(65%)',
'(50%)',
]
overall_cdc_public_geo_metadata = [
'Every two weeks',
'April 15, 2021',
'24.4M',
'31.5M',
'(78%)',
'12.0M',
'(49%)',
'(38%)',
]
table_data = {'CRDT': overall_crdt_metadata, 'CDC Restricted': overall_cdc_metadata, 'CDC Public Geo': overall_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=overall_row_names)
metadata_df.head(15)
The CDC Restricted Access Dataset's completeness has improved over time. Each new data release updates historical cases to add more information and even new cases. The percent of cases with race/ethnicity started at 43% in May 2020 and is now at 65% in April 2021. The composite measure with the percent of NYT total cases that have race/ethnicity increased from 33% in May 2020 to 50% in April 2021.
#@title
total_count_history, race_ethnicity_known_history, historical_bar_chart = FieldAnalysis(
PROJECT_ID, CDC_TABLE_HISTORY, ['race_ethnicity_combined'],
'Race/Ethnicity Completeness by Dataset Release Date')
overall_nyt_percent = []
for i, table in enumerate(CDC_TABLE_HISTORY):
nyt_us = pd.io.gbq.read_gbq(NYT_US_QUERY_STR % DATE_UP_TO_HISTORY[i], project_id=PROJECT_ID)
overall_nyt_percent.append(total_count_history[i] / nyt_us.nyt_cases[0])
overall_composite = []
for i, _ in enumerate(total_count_history):
overall_composite.append(race_ethnicity_known_history[i] * overall_nyt_percent[i])
#@title
CreateLineChart('CDC Restricted Access Dataset Completeness by Release Date',
DATE_YEAR_HISTORY,
race_ethnicity_known_history,
overall_nyt_percent,
overall_composite
).display()
We can also look more closely at how the percent of cases with race/ethnicity increased over time.
#@title
historical_bar_chart.display()
The improvements in data completeness were due to an increase in the percent cases with known race/ethnicity and a decrease in the percent of cases with data suppression over time from 9% in June 2020 to 1% in April 2021. Note that the May 2020 dataset had no data suppression possibly because it also had no state or county information.
At the end of this report, we'll look at how the CDC Restricted Access dataset's completeness has improved over time at the state level.
What we didn't include in this report:
The CDC datasets come from a case report form that is a dense, five-page form about each lab-confirmed or probable COVID-19 case (old form prior to Jan 15, 2021). The Restricted Access dataset contains 32 fields, the Public Use with Geography dataset contains 19 fields, and the Public Use version contains 12 fields. In addition to data from the case report form, the CDC separately collects aggregate data without race/ethnicity information.
The CDC distinguishes between aggregate data that comes from state and local public health websites vs. line- or case-level data that comes to the CDC from public health departments. The CDC FAQs say that aggregate data are more accurate than case data:
Aggregate counts provide the most up-to-date validated numbers on cases and deaths.
Public health websites often do contain race/ethnicity details, but state websites do not all use the same standard race/ethnicity categories, which makes it hard to collect and aggregate the data. The CRDT captured the many non-standard ways in which different states report on race/ethnicity, where ethnicity is whether a person is Hispanic/Latino. Some states report race/ethnicity as a combined field where each race/ethnicity group is mutually exclusive, which is how the CDC case dataset reports this field. Other states report race/ethnicity as separate fields where Hispanic/Latino people are counted within different race groups as well as in a separate field for ethnicity. States can also differ in terms of which race categories they use, how they define them, whether multiracial people are counted multiple times in different categories, and what's included in the "Other" race category. For more details, see this CRDT analysis.
So, aggregate data are more accurate and timely than case-level data, but case-level data use standardized race/ethnicity categories across all states and counties. If we want to use case-level data, we need to better understand the severity of these accuracy and timeliness issues.
The CDC has an FAQ about data completeness:
How complete are the data that the CDC receives about COVID-19 cases?
The COVID-19 pandemic has put unprecedented demands on the public health data supply chain. In many states, the large number of COVID-19 cases has severely strained the ability of hospitals, healthcare providers, and laboratories to report cases with complete demographic information, such as race and ethnicity. The unprecedented volume of cases has also limited the ability of state and local health departments to conduct thorough case investigations and collect all requested case data.
As a result, many COVID-19 case notifications submitted to CDC do not have complete information on patient demographics [...] Because it can be time-consuming for jurisdictions to collect the additional information, these data can lag behind the aggregate counts. Because of missing data, analyses of these data elements are likely an underestimate of the true occurrence.
The CDC's COVID Data Tracker says that they are "working with states to provide more information on race/ethnicity for reported cases. The percent of reported cases that include race/ethnicity data is increasing."
We will compare the CDC data against two sources of aggregate data: The NYT and the CRDT's public data, which are aggregated from state and local public health websites. The CRDT is the only source for case data with race/ethnicity breakdowns, but there are several sources for county-level total case counts in addition to the NYT, such as JHU and USAFacts (this paper analyzes the differences between those sources at the state level up to July, 2020 for cases and deaths).
The table below compares geographic vs. race/ethnicity availability for these three different data sources:
#@title
row_names = [
'Total Cases — States',
'Total Cases — Counties',
'Cases by Race/Ethnicity — States',
'Cases by Race/Ethnicity — Counties'
]
nyt_yn = [
'✅',
'✅',
'❌',
'❌',
]
crdt_yn = [
'✅',
'❌',
'✅',
'❌',
]
cdc_yn = [
'✅',
'✅',
'✅',
'✅',
]
table_data = {'NYT': nyt_yn, 'CRDT': crdt_yn, 'CDC': cdc_yn}
availability_df = pd.DataFrame(table_data, index=row_names)
availability_df.head()
Because the CDC is the only data source that has race/ethnicity at the county level, the most similar data for purposes of comparison are (1) NYT data at the state and county levels with no race/ethnicity, and (2) CRDT data at the state level with race/ethnicity.
We will compare against the NYT up to April 15, 2021, which is the latest reporting date in the CDC data. We will compare against the CRDT up to March 7, 2021, which was the last date they reported data. We expect to see small differences (e.g., +/- 15%) in the case counts due to time lags in reporting the data, but time lags cannot explain large percentages of missing cases.
To get a baseline of how much we could expect the CDC case counts to match the CRDT or NYT, we can see how closely the CRDT and NYT match each other. Each dot below is a state (hover to see details), and the black line shows where the CRDT and NYT case counts are equal.
#@title
crdt_states_df = pd.io.gbq.read_gbq(CRDT_QUERY, project_id=PROJECT_ID)
crdt_states_df.set_index('state', inplace=True)
nyt_states_compare_crdt_df = CreateNYTStateDataframe(NYT_STATES_COMPARE_CRDT_QUERY)
crdt_states_df.reset_index(inplace=True)
crdt_states_df['state_fips_code'] = crdt_states_df.state
crdt_states_df = crdt_states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
crdt_states_df.set_index('state_fips_code', inplace=True)
nyt_crdt_counts_df = nyt_states_compare_crdt_df.join(crdt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
nyt_crdt_counts_df['percent'] = round(nyt_crdt_counts_df.crdt_cases / nyt_crdt_counts_df.nyt_cases, 2)
nyt_crdt_counts_df.reset_index(inplace=True)
#@title
nyt_crdt_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CRDT to NYT'},
}
nyt_crdt_title = 'Ratio of CRDT to NYT Cases by State up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
nyt_crdt_counts_df, nyt_crdt_fields_dict, nyt_crdt_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(nyt_crdt_counts_df)
The ratio of CRDT to NYT cases is between 0.92 and 1.03 for all states:
We can see below that the CDC Restricted Access dataset case counts differ from the NYT case counts more drastically than the CRDT did. Note: In the analysis and charts below, we'll refer to the CDC Restricted Access dataset as the "CDC" dataset.
#@title
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY)
cdc_nyt_states_df = cdc_states_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
#@title
cdc_nyt_state_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_state_title = 'Ratio of CDC to NYT Cases by State up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_nyt_states_df)
#cdc_nyt_states_df.sort_values(by='percent')
Texas alone is missing 2.8M cases compared to the total case counts in the NYT data (39% of the 7.1M missing cases across all states).
The ratio of CDC to NYT cases is between 0.02 and 1.04 for all states + D.C.:
The 29 states that are within +/-15% of the NYT data could plausibly be off due to time lags in reporting cases to the CDC vs. reporting them on state public health websites, but there are many outlier states that are too far off from the NYT case counts to be explained by a time lag:
We can do the same analysis at the county level. The CDC Restricted Access dataset contains 3,072 counties in the 50 states + D.C., which is 98% of all counties that account for 99.9% of the population.
Each dot is a county (hover to see details). We show all 3,063 counties in the CDC data that were also in the NYT data on the left and zoom in on the smaller counties on the right. Note that the five counties in New York City and four boroughs in Alaska are missing because the NYT combined them into one region for New York City and two combined regions in Alaska; see the Appendix for more details.
#@title
# CDC vs. NYT county
cdc_counties_race_df = CreateCDCCountyRaceDataframe(CDC_COUNTIES_RACE_QUERY, 'cdc_')
nyt_counties_df = CreateNYTCountyDataframe(NYT_COUNTIES_QUERY)
cdc_nyt_counties_race_df = cdc_counties_race_df.join(nyt_counties_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
cdc_nyt_counties_race_df = cdc_nyt_counties_race_df.reset_index()
cdc_nyt_counties_race_df['percent'] = round(cdc_nyt_counties_race_df.cdc_cases / cdc_nyt_counties_race_df.nyt_cases, 2)
cdc_nyt_counties_race_df['cdc_known_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_cases /
cdc_nyt_counties_race_df.cdc_cases, 2)
cdc_nyt_counties_race_df['cdc_known_or_na_cases'] = (cdc_nyt_counties_race_df.cdc_known_cases.fillna(0) +
cdc_nyt_counties_race_df.na_cases.fillna(0))
cdc_nyt_counties_race_df['cdc_known_or_na_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_or_na_cases /
cdc_nyt_counties_race_df.cdc_cases, 2)
#PrintSummaryStats(cdc_nyt_counties_race_df)
# These values are used in the num_counties table.
#print(len(cdc_counties_race_df))
#print(len(cdc_counties_race_df) / 3143)
#print(cdc_counties_race_df.total_pop.sum())
#print(cdc_counties_race_df.total_pop.sum() / 324697795) # Population covered in these counties
#print(cdc_counties_race_df.cdc_known_cases.sum())
#print(0.55 * 324697795) # NYT population
#@title
cdc_nyt_county_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_county_title = 'Ratio of CDC to NYT Cases by County up to %s' % DATE_DISPLAY_NAME
zoom_cdc_nyt_title = 'Zoom in on counties with up to 200,000 Cases'
cdc_nyt_county_scatter = CreateScatterPlot(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, COUNTY_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)
cdc_nyt_county_zoom_scatter = CreateScatterPlot(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, zoom_cdc_nyt_title, COUNTY_CASES_ZOOM_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)
(cdc_nyt_county_scatter | cdc_nyt_county_zoom_scatter).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_counties_race_df)
Harris County, Texas is missing 383K cases compared to the total cases in the NYT data (5% of the 7.1M missing cases across all states).
The ratio of CDC to NYT cases is between 0.00 and 3.2 for the 3,063 counties in the CDC data that were also in the NYT data:
We can also view these ratios on the map on the right and compare them to the state-level totals map on the left.
#@title
cdc_nyt_states_title = 'Ratio of CDC to NYT Cases by County up to %s' % DATE_DISPLAY_NAME
cdc_nyt_county_map = CreateMap(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'county', 'ratio'
)
cdc_nyt_state_map = CreateMap(
cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_states_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'ratio'
)
(cdc_nyt_state_map | cdc_nyt_county_map).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
Notes:
We can see that the ratio of the CDC case data to NYT aggregate data is highly variable across the U.S., but there is less variability across the counties within each state. This pattern indicates that those data completeness issues may be due to policies or data collection pipelines at the state level rather than the county level. We can also see that some counties are missing entirely from the data; e.g., in Texas and Wyoming. It's possible some of these counties have cases in the data but the county name was suppressed for privacy reasons due to small population sizes. Even so, those cases would still have a state name, and so they would be captured in the map on the left above.
How much race/ethnicity information is available in the CDC data at the state and county levels?
#@title
cdc_states_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_QUERY, 'cdc_')
cdc_states_race_df.reset_index(inplace=True)
cdc_race_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Cases with race/ethnicity'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}
cdc_states_race_title = 'CDC Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_states_race_map = CreateMap(
cdc_states_race_df, cdc_race_fields_dict, cdc_states_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
cdc_counties_race_title = 'CDC Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_counties_race_map = CreateMap(
cdc_nyt_counties_race_df, cdc_race_fields_dict, cdc_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'county', 'percent'
)
(cdc_states_race_map | cdc_counties_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_states_race_df, 'cdc_known_cases_percent')
Note:
New York is missing race/ethnicity for 1.3M cases and California for 1.1M cases (27% of the 8.6M total cases missing race/ethnicity when combined).
There's high variability at the state level:
At the county level, many states have similar percentages of cases with race/ethnicity at the county level as they do at the state level, while other states such as Texas, West Virginia, New Mexico, and Missouri have a high variability at the county level. Note that states and counties with small populations will have a higher level of data suppression, which will result in a lower percentage of cases with race/ethnicity. Unfortunately, we can't extrapolate the percentage of cases that would have had race/ethnicity before data suppression because data suppression can apply to cases with known race/ethnicity or with unknown race/ethnicity (see page 9 of this Privacy Paper for details).
How does the CDC dataset compare to the CRDT dataset, which is the most up-to-date aggregate dataset for race/ethnicity at the state level up to March 7, 2021? Overall, 67% of the cases in the CRDT data have race information and 59% have ethnicity information. In the CDC data up to March 7, 65% of cases have race/ethnicity combined information.
We will use the race field in the CRDT data as a proxy for a combined race/ethnicity field. For some states, the race field is a combined race/ethnicity field that exactly matches how the CDC Restricted Access dataset reports race/ethnicity. However, the CRDT also captures the many non-standard ways that states report race/ethnicity, as described in this Covid Racial Data Tracker analysis. There's no way to do an exact comparison between the standardized race/ethnicity category in the CDC data and the many ways that race and ethnicity are reported in the CRDT. We use the race field in the CRDT dataset because it's a closer approximation of the combined race/ethnicity field than the ethnicity field.
#@title
cdc_up_to_crdt_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_UP_TO_CRDT_QUERY, 'cdc_')
crdt_df = CreateCRDTStateRaceDataframe(CRDT_QUERY)
cdc_crdt_race_df = cdc_up_to_crdt_race_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
cdc_crdt_race_df.reset_index(inplace=True)
cdc_crdt_race_df['percent'] = round(cdc_crdt_race_df.cdc_known_cases / cdc_crdt_race_df.crdt_known_race_cases, 4)
#@title
crdt_race_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Cases with race/ethnicity'},
'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT cases'},
'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}
crdt_race_title = 'CRDT Percent Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
crdt_race_map = CreateMap(
cdc_crdt_race_df, crdt_race_fields_dict, crdt_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH - 5, 'state', 'percent'
)
cdc_states_race_up_to_crdt_title = 'CDC Percent of Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
cdc_states_race_up_to_crdt_map = CreateMap(
cdc_crdt_race_df, cdc_race_fields_dict, cdc_states_race_up_to_crdt_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
(cdc_states_race_up_to_crdt_map | crdt_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_crdt_race_df, 'crdt_known_race_cases_percent')
Note that we only looked at CDC cases up to March 7, 2021 when comparing against the CRDT.
For the CRDT:
Overall, the CRDT has a higher percentage of cases with known race/ethnicity than CDC at the state level. Although it appears that the CDC has better data for Texas than the CRDT, the maps above don't account for the fact that the CDC only contains 3% of the cases in the CRDT data. To take that into account, we can compare the number of cases within each state that has known race/ethnicity instead of the percent of cases.
#@title
cdc_crdt_race_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT cases with race/ethnicity'},
'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC cases with race/ethnicity'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_race_title = 'Ratio of CDC to CRDT Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_crdt_race_df, cdc_crdt_race_fields_dict, cdc_crdt_race_title, CASES_RACE_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_crdt_race_df[cdc_crdt_race_df.state != 'NY'])
Notes:
CRDT has race/ethnicity data for 612K more people in Florida than the CDC data has.
The ratio of CDC to CRDT cases with race/ethnicity is between 0.01 and 1.04 for all states excluding New York:
Overall, the CRDT is a more complete source for race/ethnicity dataset at the state level than the CDC data in terms of both the counts of cases with race/ethnicity data and the percentage of cases with race/ethnicity data. The only exceptions are New York (which has no cases with race/ethnicity in the CRDT), Massachusetts (1.04 times as many cases with race/ethnicity than the CRDT), and Utah (1.00 as many cases).
The Public Use with Geography dataset offers a way to analyze data at the state and county level without applying for access. The dataset is released on the same day as the Restricted Access dataset and includes the same total number of cases. There are several differences from the Restricted Access dataset:
Due to the increased privacy measures, race/ethnicity is suppressed for 20% of cases vs. 1% in the Restricted Access dataset.
#@title
field_list = ['case_month', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race', 'ethnicity', 'race_ethnicity_combined']
_, _, bar_chart = FieldAnalysis(PROJECT_ID, [CDC_PUBLIC_GEO_TABLE], field_list, 'Most Complete Fields in the CDC Public Geo Dataset', calculate_race_ethnicity=True)
bar_chart.display()
We can also see that far fewer counties are included in the dataset, 47% of counties vs. 98% in the Restricted Access dataset, although they are generally counties with large populations, so they still account for 89% percent of the U.S. population.
#@title
# Manually update these fields based on the latest CDC data.
num_counties_row_names = [
'(% of NYT cases)',
'(% of cases with race/ethnicity)',
'(composite % of NYT total with race/ethnicity)',
'Number of counties',
'(as a % of all counties)',
'Population in those counties',
'(as a % of total U.S population – States + D.C.)',
]
num_counties_cdc_metadata = [
'(78%)',
'(65%)',
'(50%)',
'3,072',
'(98%)',
'324M',
'(99.9%)',
]
num_counties_cdc_public_geo_metadata = [
'(78%)',
'(49%)',
'(38%)',
'1,495',
'(47%)',
'289M',
'(89%)',
]
table_data = {'CDC Restricted': num_counties_cdc_metadata,
'CDC Public Geo': num_counties_cdc_public_geo_metadata,
}
metadata_df = pd.DataFrame(table_data, index=num_counties_row_names)
metadata_df.head(15)
The total case counts for the CDC Public Use with Geography ("CDC Public Geo" below) and CDC Restricted Access dataset ("CDC" below) are similar with only minor differences due to differing privacy restrictions.
#@title
cdc_public_geo_states_df = CreateCDCStateDataframe(CDC_PUBLIC_GEO_STATES_QUERY)
cdc_public_geo_states_df = cdc_states_df.join(cdc_public_geo_states_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
cdc_public_geo_states_df.reset_index(inplace=True)
cdc_public_geo_states_df['percent'] = round(cdc_public_geo_states_df.cdc_public_geo_cases / cdc_public_geo_states_df.cdc_cases, 4)
#@title
cdc_public_geo_state_fields_dict = {
'x': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'y': {'name': 'cdc_public_geo_cases', 'format': ',', 'title': 'CDC Public cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC Public Geo to NYT'},
}
cdc_public_geo_state_title = 'Ratio of CDC Public Geo to CDC Cases by State up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_public_geo_states_df, cdc_public_geo_state_fields_dict, cdc_public_geo_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_public_geo_states_df)
When we compare case counts at the county level, we can see that the Public Geo dataset only contains 1,471 of the 3,072 counties in the Restricted Access dataset. We can also see that these counties are those that have larger populations (at least 20K people and 1K cases).
#@title
cdc_public_geo_counties_race_df = CreateCDCCountyRaceDataframe(CDC_PUBLIC_GEO_COUNTIES_RACE_QUERY, 'cdc_public_geo_')
cdc_public_geo_counties_race_df = cdc_counties_race_df.join(cdc_public_geo_counties_race_df, on="county_fips", how='left', lsuffix='', rsuffix='_right')
cdc_public_geo_counties_race_df = cdc_public_geo_counties_race_df.reset_index()
cdc_public_geo_counties_race_df['percent'] = round(cdc_public_geo_counties_race_df.cdc_public_geo_cases / cdc_public_geo_counties_race_df.cdc_cases, 2)
cdc_public_geo_counties_race_df['cdc_known_cases_percent'] = round(cdc_public_geo_counties_race_df.cdc_known_cases /
cdc_public_geo_counties_race_df.cdc_cases, 2)
cdc_public_geo_counties_race_df['cdc_public_geo_known_cases_percent'] = round(cdc_public_geo_counties_race_df.cdc_public_geo_known_cases /
cdc_public_geo_counties_race_df.cdc_public_geo_cases, 2)
cdc_public_geo_counties_race_df['known_percent'] = round(cdc_public_geo_counties_race_df.cdc_public_geo_known_cases /
cdc_public_geo_counties_race_df.cdc_known_cases, 2)
cdc_public_geo_only_counties_race_df = cdc_public_geo_counties_race_df[cdc_public_geo_counties_race_df.cdc_public_geo_cases.isna() == False]
#PrintSummaryStats(cdc_public_geo_only_counties_race_df)
# These values are used in the tables about coverage.
#print(len(cdc_public_geo_only_counties_race_df))
#print(len(cdc_public_geo_only_counties_race_df) / 3143)
#print(cdc_public_geo_only_counties_race_df.total_pop.sum())
#print(cdc_public_geo_only_counties_race_df.total_pop.sum() / 324697795) # Population covered in these counties
#print(cdc_public_geo_only_counties_race_df.cdc_public_geo_known_cases.sum())
#@title
cdc_public_geo_county_fields_dict = {
'x': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
'y': {'name': 'cdc_public_geo_cases', 'format': ',', 'title': 'CDC Public Geo cases'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CDC Public Geo'},
}
cdc_public_geo_county_title = 'Ratio of CDC Public Geo to CDC Cases by County up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_public_geo_counties_race_df, cdc_public_geo_county_fields_dict, cdc_public_geo_county_title, COUNTY_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'county', 'ratio'
).display()
#PrintSummaryStats(cdc_public_geo_counties_race_df)
We can also compare the percent of cases with race/ethnicity in the Public Geo dataset on the vs. the Restricted Access dataset.
#@title
cdc_public_geo_states_race_df = CreateCDCStateRaceDataframe(CDC_PUBLIC_GEO_STATES_RACE_QUERY, 'cdc_public_geo_')
cdc_public_geo_states_race_df = cdc_states_race_df.join(cdc_public_geo_states_race_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
cdc_public_geo_states_race_df.reset_index(inplace=True)
cdc_public_geo_states_race_df['percent'] = round(cdc_public_geo_states_race_df.cdc_public_geo_known_cases / cdc_public_geo_states_race_df.cdc_known_cases, 4)
#@title
cdc_public_geo_states_race_fields_dict = {
'x': {'name': 'cdc_public_geo_known_cases', 'format': ',', 'title': 'Cases with race/ethnicity'},
'y': {'name': 'cdc_public_geo_cases', 'format': ',', 'title': 'CDC Public Geo cases'},
'percent': {'name': 'cdc_public_geo_known_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}
cdc_public_geo_states_race_title = 'CDC Public Geo Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_public_geo_states_race_map = CreateMap(
cdc_public_geo_states_race_df, cdc_public_geo_states_race_fields_dict, cdc_public_geo_states_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'state', 'percent'
)
(cdc_public_geo_states_race_map | cdc_states_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#PrintSummaryStats(cdc_public_geo_states_race_df, field='cdc_public_geo_known_cases_percent')
In most states, the Public Geo dataset has a lower percentage of cases with race/ethnicity information than the Restricted Access dataset.
We can compare these percentages more directly by taking the ratio of the count of cases with race/ethnicity in each dataset at the state level.
#@title
cdc_public_geo_states_race_ratio_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC cases with race/ethnicity'},
'y': {'name': 'cdc_public_geo_known_cases', 'format': ',', 'title': 'CDC Public Geo cases with race/ethnicity'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC Public Geo to CDC'},
}
cdc_public_geo_states_race_ratio_title = 'Ratio of CDC Public Geo to CDC Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_public_geo_states_race_df, cdc_public_geo_states_race_ratio_fields_dict, cdc_public_geo_states_race_ratio_title, CASES_RACE_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_public_geo_states_race_df)
As expected, most states have fewer cases with race/ethnicity information in the Public Geo dataset than in the Restricted Access dataset.
The ratio of CDC Public Geo to CDC Restricted Access cases with race/ethnicity is between 0 and 1.40 for all states:
Interestingly, there are three states that have more cases with race/ethnicity in the Public Geo dataset: Louisiana, Wyoming, and West Virginia. Our best guess to explain this is that it's a consequence of the privacy measures. In states with counties (parishes in Louisiana) with small populations or low case counts, the county names will be suppressed and changed to "NA." Once those cases are combined into an "NA" county, there may be enough cases to exceed the threshold for having at least 11 cases with the unique combination of fields including race and ethnicity. In the Restricted Access dataset, those cases will be listed under their county names and may not meet the bar for having at least 5 cases with the unique combination of fields including race/ethnicity. So, by pooling together cases from small counties into an "NA" county, it's possible to have race/ethnicity information for more cases in the Public Geo dataset than in the Restricted Access dataset at the state level.
At the county level, we can see that the Public Geo dataset has the same or fewer cases with race/ethnicity information than the Restricted Access dataset in every county.
#@title
cdc_public_geo_race_fields_dict = {
'x': {'name': 'cdc_public_geo_known_cases', 'format': ',', 'title': 'CDC Public Geo cases with race/ethnicity'},
'y': {'name': 'cdc_public_geo_cases', 'format': ',', 'title': 'CDC Public Geo cases'},
'percent': {'name': 'cdc_public_geo_known_cases_percent', 'format': '.0%', 'title': 'Percent cases with race/ethnicity'},
}
cdc_public_geo_counties_race_title = 'CDC Public Geo Percent of Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
cdc_public_geo_counties_race_map = CreateMap(
cdc_public_geo_counties_race_df, cdc_public_geo_race_fields_dict, cdc_public_geo_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT, MAP_WIDTH, 'county', 'percent'
)
(cdc_public_geo_counties_race_map | cdc_counties_race_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
cdc_public_geo_counties_race_df['known_percent'] = round(cdc_public_geo_counties_race_df.cdc_public_geo_known_cases /
cdc_public_geo_counties_race_df.cdc_known_cases, 2)
cdc_public_geo_compare_race_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC cases with race/ethnicity'},
'y': {'name': 'cdc_public_geo_known_cases', 'format': ',', 'title': 'CDC Public Geo cases with race/ethnicity'},
'percent': {'name': 'known_percent', 'format': '.2f', 'title': 'Ratio of CDC Public Geo to CDC'},
}
cdc_public_geo_compare_race_title = 'Ratio of CDC Public Geo to CDC Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
CreateScatterPlotAndMap(
cdc_public_geo_counties_race_df, cdc_public_geo_compare_race_fields_dict, cdc_public_geo_compare_race_title, COUNTY_CASES_RACE_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'county', 'ratio'
).display()
#PrintSummaryStats(cdc_public_geo_counties_race_df, field='known_percent')
How can we use these datasets at the state and county levels?
When using these datasets, we recommend being transparent about the level of completeness or lack thereof in (1) the total case counts, and (2) the percentage of cases with race/ethnicity. Just like we created a composite measure of the overall datasets along these lines, we do the same at the state and county levels in the charts below. You can think of these as "nutrition facts" labels for each state and county's data.
We also recommend looking at the race/ethnicity breakdowns within each state or county to see if race/ethnicity information is missing disproportionately from one group; e.g., North Dakota is missing data entirely for all groups other than Asian and American Indian/Alaska Native. If states or counties' data are too incomplete to draw conclusions from, you may want to exclude them entirely from your analyses. You may also find it useful to include unknowns to highlight the incompleteness in the data.
The scatterplots below can help show the completeness issues in each state and county. The scatterplots show (1) case counts as a percentage of the NYT total case counts on the y-axis, and (2) the percentage of cases with known race/ethnicity on the x-axis. The colors of the dots and states on the map show the composite completeness measure by multiplying those two numbers together, which is the composite percentage of total expected cases that have race/ethnicity in the datasets (blue is more complete and red is less complete).
We show the datasets in order from most to least data completeness: CRDT, CDC Restricted Access, and CDC Public Geo.
#@title
cdc_crdt_race_df = cdc_up_to_crdt_race_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_crdt_race_df.reset_index(inplace=True)
nyt_crdt_counts_df.set_index('state_fips_code', inplace=True)
cdc_crdt_race_df.set_index('state_fips_code', inplace=True)
cdc_crdt_race_df_composite = nyt_crdt_counts_df.join(cdc_crdt_race_df, on="state_fips_code", how='inner', lsuffix='', rsuffix='_right')
nyt_crdt_counts_df.reset_index(inplace=True)
cdc_crdt_race_df_composite.reset_index(inplace=True)
cdc_crdt_race_df_composite['percent'] = round(cdc_crdt_race_df_composite.crdt_cases / cdc_crdt_race_df_composite.nyt_cases, 2)
cdc_crdt_race_df_composite['cases_max_100_percent'] = cdc_crdt_race_df_composite.percent.clip(upper=1)
cdc_crdt_race_df_composite['composite_percent'] = cdc_crdt_race_df_composite.cases_max_100_percent * cdc_crdt_race_df_composite.crdt_known_race_cases_percent
crdt_composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CRDT percent of NYT total cases'},
'x': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'CRDT percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CRDT percent of NYT total with race/ethnicity'},
}
crdt_composite_title = 'CRDT Percent of NYT Cases with Race/Ethnicity up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME
state_composite_map = CreateScatterPlotAndMap(
cdc_crdt_race_df_composite, crdt_composite_fields_dict, crdt_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_crdt_race_df_composite, 'composite_percent')
#@title
cdc_nyt_states_race_df = cdc_states_race_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_race_df.reset_index(inplace=True)
cdc_nyt_states_race_df['percent'] = round(cdc_nyt_states_race_df.cdc_cases / cdc_nyt_states_race_df.nyt_cases, 2)
cdc_nyt_states_race_df['cases_max_100_percent'] = cdc_nyt_states_race_df.percent.clip(upper=1)
cdc_nyt_states_race_df['composite_percent'] = cdc_nyt_states_race_df.cases_max_100_percent * cdc_nyt_states_race_df.cdc_known_cases_percent
composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CDC percent of NYT total cases'},
'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CDC percent of NYT total with race/ethnicity'},
}
composite_title = 'CDC Percent of NYT Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
state_composite_map = CreateScatterPlotAndMap(
cdc_nyt_states_race_df, composite_fields_dict, composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_states_race_df, 'composite_percent')
#@title
cdc_public_geo_nyt_states_race_df = cdc_public_geo_states_race_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_public_geo_nyt_states_race_df.reset_index(inplace=True)
cdc_public_geo_nyt_states_race_df['percent'] = round(cdc_public_geo_nyt_states_race_df.cdc_public_geo_cases / cdc_public_geo_nyt_states_race_df.nyt_cases, 2)
cdc_public_geo_nyt_states_race_df['cases_max_100_percent'] = cdc_public_geo_nyt_states_race_df.percent.clip(upper=1)
cdc_public_geo_nyt_states_race_df['composite_percent'] = (
cdc_public_geo_nyt_states_race_df.cases_max_100_percent *
cdc_public_geo_nyt_states_race_df.cdc_public_geo_known_cases_percent)
public_geo_composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CDC Public Geo percent of NYT total cases'},
'x': {'name': 'cdc_public_geo_known_cases_percent', 'format': '.0%', 'title': 'CDC Public Geo percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CDC Public Geo percent of NYT total with race/ethnicity'},
}
public_geo_composite_title = 'CDC Public Geo Percent of NYT Cases with Race/Ethnicity up to %s' % DATE_DISPLAY_NAME
state_composite_map = CreateScatterPlotAndMap(
cdc_public_geo_nyt_states_race_df, public_geo_composite_fields_dict, public_geo_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'state', 'percent'
)
state_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_public_geo_nyt_states_race_df, 'composite_percent')
Notes:
To compare these three datasets, we can look at how many states have a composite completeness measure of at least 50% and at least 85%.
#@title
# Manually update these fields based on the latest CDC data.
states_row_names = [
'Number of states with composite >= 50%',
'(as a percent of all states + D.C.)',
'Number of states with composite >= 85%',
'(as a percent of all states + D.C.)',
]
states_crdt_metadata = [
'49',
'(96%)',
'16',
'(31%)',
]
states_cdc_metadata = [
'28',
'(55%)',
'2',
'(4%)',
]
states_cdc_public_geo_metadata = [
'17',
'(33%)',
'0',
'(0%)',
]
table_data = {'CRDT': states_crdt_metadata, 'CDC Restricted': states_cdc_metadata, 'CDC Public Geo': states_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=states_row_names)
metadata_df.head(15)
If we require that states or counties have 50% of the total expected cases with race/ethnicity, we can use 49 states from CRDT, 28 states from CDC Restricted, and only 17 states from CDC Public Geo. If we tighten that requirement to 85% of total expected cases with race/ethnicity, we can only use 16 states from CRDT, two states from CDC Restricted, and no states from CDC Public Geo.
We can look at the same scatterplots and maps at the county level for the two CDC datasets.
#@title
cdc_nyt_counties_race_df['cases_max_100_percent'] = cdc_nyt_counties_race_df.percent.clip(upper=1)
cdc_nyt_counties_race_df['composite_percent'] = cdc_nyt_counties_race_df.cases_max_100_percent * cdc_nyt_counties_race_df.cdc_known_cases_percent
county_composite_map = CreateScatterPlotAndMap(
cdc_nyt_counties_race_df, composite_fields_dict, composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'county', 'percent'
)
county_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
# Mind the total county numbers for the denominator.
#PrintSummaryStats(cdc_nyt_counties_race_df, field='composite_percent')
#greater_than_85_df = cdc_nyt_counties_race_df[cdc_nyt_counties_race_df['composite_percent'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = cdc_nyt_counties_race_df[cdc_nyt_counties_race_df['composite_percent'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
#print('total counties: 3143')
#@title
cdc_public_geo_nyt_counties_race_df = cdc_public_geo_counties_race_df.join(nyt_counties_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
cdc_public_geo_nyt_counties_race_df = cdc_public_geo_nyt_counties_race_df.reset_index()
cdc_public_geo_nyt_counties_race_df['percent'] = round(cdc_public_geo_nyt_counties_race_df.cdc_public_geo_cases / cdc_public_geo_nyt_counties_race_df.nyt_cases, 2)
cdc_public_geo_nyt_counties_race_df['cases_max_100_percent'] = cdc_public_geo_nyt_counties_race_df.percent.clip(upper=1)
cdc_public_geo_nyt_counties_race_df['composite_percent'] = (
cdc_public_geo_nyt_counties_race_df.cases_max_100_percent *
cdc_public_geo_nyt_counties_race_df.cdc_public_geo_known_cases_percent)
county_composite_map = CreateScatterPlotAndMap(
cdc_public_geo_nyt_counties_race_df, public_geo_composite_fields_dict, public_geo_composite_title, 1, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH - 5, 'county', 'percent'
)
county_composite_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#PrintSummaryStats(cdc_public_geo_nyt_counties_race_df, field='composite_percent')
#greater_than_85_df = cdc_public_geo_nyt_counties_race_df[cdc_public_geo_nyt_counties_race_df['composite_percent'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = cdc_public_geo_nyt_counties_race_df[cdc_public_geo_nyt_counties_race_df['composite_percent'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
#print('total counties: 3143')
Notes:
#@title
# Manually update these fields based on the latest CDC data.
counties_row_names = [
'Number of counties with composite >= 50%',
'(as a percent of all counties)',
'Number of counties with composite >= 85%',
'(as a percent of all counties)',
'Population in counties with composite >= 50%',
'(as a % of total U.S population – States + D.C.)',
'Population in counties with composite >= 85%',
'(as a % of total U.S population – States + D.C.)',
]
counties_crdt_metadata = [
'-',
'-',
'-',
'-',
'-',
'-',
'-',
'-',
]
counties_cdc_metadata = [
'1,552',
'(49%)',
'153',
'(5%)',
'176M',
'(54%)',
'10M',
'(3%)',
]
counties_cdc_public_geo_metadata = [
'535',
'(17%)',
'2',
'(0%)',
'87M',
'(26%)',
'91K',
'(0%)',
]
table_data = {'CRDT': counties_crdt_metadata, 'CDC Restricted': counties_cdc_metadata, 'CDC Public Geo': counties_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=counties_row_names)
metadata_df.head(15)
At the county level, the Restricted Access dataset is more complete on all measures. If we require that counties have only 50% of the total expected cases with race/ethnicity, the Restricted Access dataset meets that bar for 49% of all counties, and the Public Geo dataset meets that bar for only 17% of all counties. If we tighten that requirement to 85% of total expected cases with race/ethnicity, we can only use 5% of all counties from CDC Restricted and 0% of all counties from CDC Public Geo.
The data completeness has improved over time for some states while others have stayed less complete. Below we compare the composite data completeness by state over time from June 2020, when state information was added to the dataset, to April 2021.
#@title
composite_fields_dict = {
'y': {'name': 'cases_max_100_percent', 'format': '.0%', 'title': 'CDC percent of NYT total cases'},
'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
'percent': {'name': 'composite_percent', 'format': '.0%', 'title': 'Composite: CDC percent of NYT total with race/ethnicity'},
}
monthly_composite_title = 'CDC Data Completeness: %s'
monthly_state_dfs = []
monthly_state_maps = []
OFFSET_FOR_MAY_2020 = 1
for i, table in enumerate(CDC_TABLE_HISTORY[OFFSET_FOR_MAY_2020:]):
cdc_states_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_QUERY_STR % table, 'cdc_')
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY_STR % table)
cdc_states_race_df.reset_index(inplace=True)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY_STR % DATE_UP_TO_HISTORY[i + OFFSET_FOR_MAY_2020])
cdc_nyt_states_df = cdc_states_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
cdc_nyt_states_race_df = cdc_states_race_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_race_df.reset_index(inplace=True)
cdc_nyt_states_race_df['percent'] = round(cdc_nyt_states_race_df.cdc_cases / cdc_nyt_states_race_df.nyt_cases, 2)
cdc_nyt_states_race_df['cases_max_100_percent'] = cdc_nyt_states_race_df.percent.clip(upper=1)
cdc_nyt_states_race_df['composite_percent'] = cdc_nyt_states_race_df.cases_max_100_percent * cdc_nyt_states_race_df.cdc_known_cases_percent
monthly_state_dfs.append(cdc_nyt_states_race_df)
cdc_states_composite_map = CreateMap(
cdc_nyt_states_race_df, composite_fields_dict, monthly_composite_title % DATE_YEAR_DISPLAY_NAME_HISTORY[i + OFFSET_FOR_MAY_2020],
TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2 / 3 - 10, MAP_WIDTH * 2 / 3 - 10, 'state', 'percent'
)
monthly_state_maps.append(cdc_states_composite_map)
#@title
for i in range(0, len(monthly_state_maps) - 3, 3):
(monthly_state_maps[i] | monthly_state_maps[i + 1] | monthly_state_maps[i + 2]).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT * 2 / 3 - 50
).display()
# Last row
(monthly_state_maps[-3] | monthly_state_maps[-2] | monthly_state_maps[-1]).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT * 2 / 3 - 50
).display()
Another way of looking at this data is to track the state-level averages over time for the percent of the NYT total cases, percent of cases with race/ethnicity, and the composite percent.
composite_median_list = []
for df in monthly_state_dfs:
composite_median_list.append(round(df.composite_percent.mean(), 2))
nyt_percent_median_list = []
for df in monthly_state_dfs:
nyt_percent_median_list.append(round(df.cases_max_100_percent.mean(), 2))
race_percent_median_list = []
for df in monthly_state_dfs:
race_percent_median_list.append(round(df.cdc_known_cases_percent.mean(), 2))
#@title
CreateLineChart('CDC Restricted Access Dataset Completeness: State-Level Averages by Release Date',
DATE_YEAR_HISTORY[OFFSET_FOR_MAY_2020:],
race_percent_median_list,
nyt_percent_median_list,
composite_median_list,
).display()
The average state-level percent of cases with race/ethnicity increased from 46% in June 2020 to 62% in April 2021. The composite measure of the percent of NYT total cases with race/ethnicity increased from 39% in June 2020 to 49% in April 2021. While some states' data still remain incomplete, we can see that many states have slowly and gradually improved over time.
We didn't show data for territories in the maps above due to limitations in the map visualizations. Here are the details on data completeness for the territories included in the CDC Restricted Access Dataset. Note that the dataset didn't include any cases in American Samoa.
#@title
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY, include_territories=True)
cdc_nyt_states_df = cdc_states_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
cdc_nyt_states_df.set_index('state', inplace=True)
cdc_states_race_df = CreateCDCStateRaceDataframe(CDC_STATES_RACE_QUERY, 'cdc_', include_territories=True)
cdc_states_race_df.set_index('state', inplace=True)
row_names = [
'CDC cases',
'NYT cases',
'(CDC as a % of NYT cases)',
'Cases with race/ethnicity',
'(% with race/ethnicity)',
'(composite % of NYT total with race/ethnicity)'
]
cdc_cases = [cdc_nyt_states_df.loc[territory].cdc_cases for territory in TERRITORIES[:-1]]
cdc_cases = [float(i) for i in cdc_cases]
nyt_cases = [cdc_nyt_states_df.loc[territory].nyt_cases for territory in TERRITORIES[:-1]]
nyt_cases = [float(i) for i in nyt_cases]
nyt_percent = [str(round(cdc_nyt_states_df.loc[territory].percent * 100)) + '%' for territory in TERRITORIES[:-1]]
race_ethnicity_cases = [cdc_states_race_df.loc[territory].cdc_known_cases for territory in TERRITORIES[:-1]]
race_ethnicity_percent = [str(round(cdc_states_race_df.loc[territory].cdc_known_cases_percent * 100)) + '%' for territory in TERRITORIES[:-1]]
composite_percent = [str(round(cdc_nyt_states_df.loc[territory].percent * cdc_states_race_df.loc[territory].cdc_known_cases_percent * 100)) + '%' for territory in TERRITORIES[:-1]]
territories_data = list(zip(cdc_cases,
nyt_cases,
nyt_percent,
race_ethnicity_cases,
race_ethnicity_percent,
composite_percent))
table_data = {'Puerto Rico': territories_data[0], 'Guam': territories_data[1], 'Virgin Islands': territories_data[2], 'Northern Mariana Islands': territories_data[3]}
territories_df = pd.DataFrame(table_data, index=row_names)
pd.options.display.float_format = '{:,.0f}'.format
territories_df.head(n=10)
The additional fields in the dataset are quite incomplete; whether the person died is known for 52% of cases, whether they were hospitalized is known for 40% of cases, the symptom onset date is known for 47% of cases, and all other fields are known for 27% or fewer of all cases.
#@title
field_list = ['death_yn', 'hosp_yn', 'icu_yn', 'onset_dt', 'pos_spec_dt', 'hc_work_yn',
'pna_yn', 'abxchest_yn', 'acuterespdistress_yn', 'mechvent_yn', 'fever_yn', 'sfever_yn', 'chills_yn', 'myalgia_yn', 'runnose_yn',
'sthroat_yn', 'cough_yn', 'sob_yn', 'nauseavomit_yn', 'headache_yn', 'abdom_yn', 'diarrhea_yn', 'medcond_yn']
_, _, bar_chart = FieldAnalysis(PROJECT_ID, [CDC_TABLE], field_list, 'Least Complete Fields in the CDC Restricted Access Dataset')
bar_chart.display()
The CDC also commented on these fields in their case data FAQs:
Because of the volume of cases, most health departments are unable to conduct investigations of every case to obtain additional information. Because of this, most case reports are missing data on patient demographics, symptoms, underlying health conditions, characteristics of hospitalizations such as ventilator use, and other factors such as recent travel history.
The case report form contains many more fields, but unfortunately, the fields get more incomplete as you go down the form. Citizens for Responsibility and Ethics in Washington (CREW) obtained a version of this data via FOIA that contained 101 fields with data up to Aug 25, 2020 and shared it with MSM/SHLI. Several of the additional fields from that dataset are shown below; the field with the most known values is whether the case was associated with an outbreak, but even that is only known for 30% of cases.
#@title
field_list = ['death_week', 'icu_length', 'hosp_length', 'translator_yn', 'housing', 'exp_work_critical', 'outbreak_associated',
'rigors_yn', 'taste_yn', 'fatigue_yn', 'wheezing_yn', 'diffbreathing_yn', 'chestpain_yn', 'test_pcr', 'test_serologic',
'exp_adultfacility', 'exp_airport', 'exp_animal', 'exp_community', 'exp_gathering', 'exp_contact', 'exp_correctional',
'exp_ship', 'exp_house', 'exp_other', 'exp_school', 'exp_othcountry', 'exp_unk', 'exp_work']
project_id = 'msm-internal-data'
_, _, bar_chart = FieldAnalysis(PROJECT_ID, [CREW_TABLE], field_list, 'Additional Fields in the FOIA\'d CREW/CDC Dataset')
bar_chart.display()
We combined the race and ethnicity fields following the CDC's guidance on the logic they used to combine the fields in their datasets.
#@title
row_names = [
'American Indian / Alaska Native',
'Asian',
'Black',
'Native Hawaiian / Pacific Islander',
'White',
'Multiple/Other',
'Unknown',
'Missing'
]
hispanic = ['Hispanic/Latino'] * 8
non_hispanic = [
'American Indian/Alaska Native, Non-Hispanic',
'Asian, Non-Hispanic',
'Black, Non-Hispanic',
'Native Hawaiian/Pacific Islander, Non-Hispanic',
'White, Non-Hispanic',
'Multiple/Other, Non-Hispanic',
'Unknown',
'Unknown'
]
unknown = ['Unknown'] * 8
missing = [
'Unknown',
'Unknown',
'Unknown',
'Unknown',
'Unknown',
'Unknown',
'Unknown',
'Missing'
]
table_data = {'Non-Hispanic/Latino': non_hispanic, 'Hispanic/Latino': hispanic, 'Unknown': unknown, 'Missing': missing}
race_ethnicity_combined_df = pd.DataFrame(table_data, index=row_names)
race_ethnicity_combined_df.head(n=10)
The CDC Restricted dataset includes a county_fips_code field with a unique identifier for each county. However, we used a different mapping to county FIPS codes due to data quality issues. When we used the county_fips_code field provided in the CDC dataset, 113K records with known state and county values had no county_fips_code, including 52K cases in Long Beach, CA (part of Los Angeles county) and nearly all 45K cases in D.C. We created a mapping using the American Community Survey (ACS) 2019 5-year estimates data and then modified the mapping to handle cases of misspellings and other issues in the CDC dataset. We documented the changes and the new mapping in this spreadsheet.
With the new mapping, we now match all but 546 cases with known state and county values to county FIPS codes. We also identified 84 non-existent state-county combinations listed here that the CDC file was matching to county_fips_codes for 885 cases. We no longer match them to any county_fips_codes, but we do report them in the state-level data for that state.
One geographical exception is that the NYT combined the five separate counties that make up New York City into one geographic unit:
Another geographical exception is that the NYT combined four boroughs (county equivalents) in Alaska as two combined boroughs:
We excluded all of these cases from the county-level maps above.
The COVID Data Tracker dashboard is the easiest way to see U.S.-level data with race/ethnicity breakdowns for cases, deaths, and over time. The COVID Data Tracker dashboard lists the Public Use case surveillance data as the underlying data source, but there are some small differences. For example, if we compare a snapshot of the COVID Data Tracker dashboard from April 15, the COVID Data Tracker contains 24.3M cases where 61% have race/ethnicity, while the case surveillance data up to April 15 has 24.4M cases where 65% have race/ethnicity. One possible explanation is that more information about race/ethnicity came into the CDC between April 16 and April 30 when the case surveillance datasets were released.
Here is the full comparison between the CRDT, CDC Restricted, and CDC Public Geo datasets. Parts of this table appear in the paper above; this consolidates them into all one place for a full overview.
#@title
table_data = {'CRDT': overall_crdt_metadata + (len(num_counties_cdc_metadata) - 3) * ['-'] + states_crdt_metadata + counties_crdt_metadata,
'CDC Restricted': overall_cdc_metadata + num_counties_cdc_metadata[3:] + states_cdc_metadata + counties_cdc_metadata,
'CDC Public Geo': overall_cdc_public_geo_metadata + num_counties_cdc_public_geo_metadata[3:] + states_cdc_public_geo_metadata + counties_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=overall_row_names + num_counties_row_names[3:] + states_row_names + counties_row_names)
metadata_df.head(25)
To make it easier to hover over small counties, here are larger versions of the county maps that appeared in this report.
#@title
CreateMap(
cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'ratio'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
CreateMap(
cdc_nyt_counties_race_df, cdc_race_fields_dict, cdc_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
CreateMap(
cdc_public_geo_counties_race_df, cdc_public_geo_race_fields_dict, cdc_public_geo_counties_race_title, TOTAL_CASES_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
CreateMap(
cdc_public_geo_counties_race_df, cdc_public_geo_compare_race_fields_dict, cdc_public_geo_compare_race_title, COUNTY_CASES_RACE_SCALE_MAX, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'ratio'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).display()
#@title
CreateMap(
cdc_nyt_counties_race_df, composite_fields_dict, composite_title, 1, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
#@title
CreateMap(
cdc_public_geo_nyt_counties_race_df, public_geo_composite_fields_dict, public_geo_composite_title, 1, MAP_HEIGHT * 2, MAP_WIDTH * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=MAP_HEIGHT - 50
).configure_mark(
stroke='grey'
).display()
Please email us at shli-covid-data-analysis@googlegroups.com with questions or comments.
#@title
#%%shell
#jupyter nbconvert --to html 'cdc_case_data.ipynb'