How complete are the CDC's COVID-19 Case Surveillance datasets for race/ethnicity at the state and county levels?

May 24, 2021

In [1]:
#@title
# How to update the data:
# 1. You may need to copy this colab so you have your own version.
# 2. Update the history constants below to have the latest data's suffix.
# 3. Update the date history constants below to be the last case date included in the data.
# 4. Updates the scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
#    Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full updated of
#    the paper including text and tables, a lot of that is done in commented out PrintSummaryStats() statements.

import pandas as pd
import altair as alt
from vega_datasets import data

from google.colab import auth
auth.authenticate_user()

# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
pd.options.display.float_format = '{:,.2f}'.format

# Table name suffixes.
DATE_HISTORY = [
    '20200529',
    '20200627',
    '20200731',
    '20200831',
    '20200930',
    '20201031',
    '20201204',
    '20201231',
    '20210131',
    '20210228',
    '20210331',
    '20210430',
]

# Cases up to these dates for querying NYT case counts.
DATE_UP_TO_HISTORY = [
    'DATE(2020, 05, 14)',
    'DATE(2020, 06, 12)',
    'DATE(2020, 07, 16)',
    'DATE(2020, 08, 17)',
    'DATE(2020, 09, 15)',
    'DATE(2020, 10, 16)',
    'DATE(2020, 11, 19)',
    'DATE(2020, 12, 16)',
    'DATE(2021, 01, 16)',
    'DATE(2021, 02, 13)',
    'DATE(2021, 03, 16)',
    'DATE(2021, 04, 15)',
]

# Cases up to these dates for charts.
DATE_UP_TO_DISPLAY_NAME_HISTORY = [
    'May 14',
    'Jun 12',
    'Jul 16',
    'Aug 17',
    'Sep 15',
    'Oct 16',
    'Nov 19',
    'Dec 16',
    'Jan 16',
    'Feb 13',
    'Mar 16',
    'Apr 15',
]

# Dates for Altair line charts; the 1st of the month formatted to the previous day.
DATE_YEAR_HISTORY = [
    '2020-05-28',
    '2020-06-28',
    '2020-07-28',
    '2020-08-28',
    '2020-09-28',
    '2020-10-28',
    '2020-11-28',
    '2020-12-28',
    '2021-01-28',
    '2021-02-28',
    '2021-03-28',
    '2021-04-28',
]

# Dates for Altair bar charts, which were less particular than the line charts.
DATE_YEAR_DISPLAY_NAME_HISTORY = [
    'May 2020',
    'Jun 2020',
    'Jul 2020',
    'Aug 2020',
    'Sep 2020',
    'Oct 2020',
    'Nov 2020',
    'Dec 2020',
    'Jan 2021',
    'Feb 2021',
    'Mar 2021',
    'Apr 2021',
]

# Project and table names.
PROJECT_ID = 'msm-secure-data-1b'

def FullTableName(date):
  return '`%s.ndunlap_secure.cdc_restricted_access_%s`' % (PROJECT_ID, date)

CDC_TABLE_HISTORY = [FullTableName(date) for date in DATE_HISTORY]
CDC_TABLE = CDC_TABLE_HISTORY[-1]
CDC_PUBLIC_GEO_TABLE = '`%s.ndunlap_secure.cdc_public_use_with_geo_%s`' % (PROJECT_ID, DATE_HISTORY[-1])
CRDT_TABLE = '`%s.ndunlap_secure.crdt_20210307`' % PROJECT_ID
CREW_TABLE = '`msm-internal-data.crew.covid_case_surveillance`'

# Dates in different formats.
DATE = DATE_UP_TO_HISTORY[-1]
DATE_DISPLAY_NAME = DATE_UP_TO_DISPLAY_NAME_HISTORY[-1]
CRDT_DATE = '20210307'
CRDT_COMPARE_DATE = 'DATE(2021, 03, 07)'
CRDT_COMPARE_DATE_DISPLAY_NAME = 'Mar 7'
TOTAL_CRDT_CASES = 28756184	 # from https://covidtracking.com/data/national

# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
TOTAL_CASES_SCALE_MAX = 4000000
COUNTY_CASES_SCALE_MAX = 1400000
COUNTY_CASES_ZOOM_SCALE_MAX = 200000
CASES_RACE_SCALE_MAX = 3000000 # known race/ethnicity
COUNTY_CASES_RACE_SCALE_MAX = 900000 # known race/ethnicity

# Chart settings.
SCATTER_HEIGHT = 300
SCATTER_WIDTH = 300
MAP_HEIGHT = 300
MAP_WIDTH = 450
US_STATES_TOPO = alt.topo_feature(data.us_10m.url, 'states')
US_COUNTIES_TOPO = alt.topo_feature(data.us_10m.url+"#", 'counties')

TERRITORIES = ('PR', 'GU', 'VI', 'MP', 'AS')
NYT_TERRITORIES = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
STATES_TO_FIPS = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12,
                  'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23,
                  'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34,
                  'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45,
                  'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56,
                  'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78, 'USVI': 78, 'NYC': 36}
FIPS_TO_STATES = {STATES_TO_FIPS[key]: key for key in STATES_TO_FIPS}
RACE_ETHNICITY_COMBINED_MAP = {
    'Asian, Non-Hispanic': 'asian_cases',
    'Black, Non-Hispanic': 'black_cases',
    'White, Non-Hispanic': 'white_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
    'Hispanic/Latino': 'hispanic_cases',
    'Multiple/Other, Non-Hispanic': 'other_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
    'Missing': 'unknown_cases',
    'Unknown': 'unknown_cases',
    'NA': 'na_cases',
}
RACE_ETHNICITY_COMBINED_KNOWN_MAP = {
   'Asian, Non-Hispanic': 'cdc_known_cases',
   'Black, Non-Hispanic': 'cdc_known_cases',
   'White, Non-Hispanic': 'cdc_known_cases',
   'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
   'Hispanic/Latino': 'cdc_known_cases',
   'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
   'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
   'Missing': 'cdc_unknown_cases',
   'Unknown': 'cdc_unknown_cases',
   'NA': 'cdc_na_cases',
}
In [2]:
#@title
CDC_OVERALL_RACE_QUERY = ('''
SELECT
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
GROUP BY
   1
''' % CDC_TABLE)

NYT_US_QUERY_STR = ('''
SELECT
  cases as nyt_cases,
  FROM `msm-secure-data-1b.ndunlap_secure.nyt_us`
  WHERE
  date = %s
''')

CRDT_QUERY = ('''
SELECT
  State as state,
  Cases_Total as crdt_cases,
  Cases_Total - Cases_Unknown as crdt_known_race_cases,
  ROUND(1 - Cases_Unknown / Cases_Total, 4) as crdt_known_race_cases_percent,
  Cases_Total - Cases_Ethnicity_Unknown as crdt_known_ethnicity_cases,
  ROUND(1 - Cases_Ethnicity_Unknown / Cases_Total, 4) as crdt_known_ethnicity_cases_percent,
FROM %s
WHERE
  date = %s
''' % (CRDT_TABLE, CRDT_DATE))

NYT_STATES_QUERY_STR = ('''
SELECT
  state_name,
  state_fips_code,
  confirmed_cases as nyt_cases,
  deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
  date = %s AND
  state_fips_code IS NOT NULL
''')

NYT_STATES_QUERY = NYT_STATES_QUERY_STR % DATE
NYT_STATES_COMPARE_CRDT_QUERY = NYT_STATES_QUERY_STR % CRDT_COMPARE_DATE

CDC_STATES_QUERY_STR = ('''
SELECT
  res_state,
  COUNT(*) as cdc_cases
FROM
  %s
GROUP BY
   res_state
''')

CDC_STATES_QUERY = CDC_STATES_QUERY_STR % CDC_TABLE

NYT_COUNTIES_QUERY = ('''
SELECT
  county_fips_code,
  confirmed_cases as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s AND
  county_fips_code IS NOT NULL
''' % DATE)

CDC_COUNTIES_RACE_QUERY = ('''
SELECT
  res_state,
  res_county,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
GROUP BY
   res_county,
   res_state,
   race_ethnicity_combined
''' % CDC_TABLE)

COUNTY_FIPS_MAPPING_QUERY = ('''
SELECT
*
FROM
  `msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''')

ACS_POPULATION_DATA_QUERY = ('''
SELECT
  state,
  county,
  county_fips,
  total_pop
FROM
  `msm-internal-data.ipums_acs.acs_2019_5year_county`
''')

CDC_STATES_RACE_QUERY_STR = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
GROUP BY
   res_state,
   race_ethnicity_combined
''')

CDC_STATES_RACE_QUERY = CDC_STATES_RACE_QUERY_STR % CDC_TABLE

CDC_STATES_RACE_UP_TO_CRDT_QUERY = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  cdc_case_earliest_dt <= %s
GROUP BY
   res_state,
   race_ethnicity_combined
''' % (CDC_TABLE, CRDT_COMPARE_DATE))

CDC_PUBLIC_GEO_STATES_QUERY = ('''
SELECT
  res_state,
  COUNT(*) as cdc_public_geo_cases
FROM
  %s
GROUP BY
   res_state
''' % CDC_PUBLIC_GEO_TABLE)

RACE_ETHNICITY_COMBINED_STR = '''
  CASE
    WHEN ethnicity = "Non-Hispanic/Latino" AND race != "Missing" THEN race
    WHEN ethnicity = "Hispanic/Latino" THEN ethnicity
    WHEN ethnicity = "NA" THEN "NA"
    WHEN ethnicity = "Missing" AND race = "Missing" THEN "Missing"
    ELSE "Unknown"
  END
'''

CDC_PUBLIC_GEO_COUNTIES_RACE_QUERY = ('''
SELECT
  res_state,
  res_county,
  %s as race_ethnicity_combined,
  COUNT(*) as cdc_public_geo_cases
FROM
  %s
GROUP BY
   res_county,
   res_state,
   race_ethnicity_combined
''' % (RACE_ETHNICITY_COMBINED_STR, CDC_PUBLIC_GEO_TABLE))

CDC_PUBLIC_GEO_STATES_RACE_QUERY = ('''
SELECT
  res_state,
  %s as race_ethnicity_combined,
  COUNT(*) as cdc_public_geo_cases
FROM
  %s
GROUP BY
   res_state,
   race_ethnicity_combined
''' % (RACE_ETHNICITY_COMBINED_STR, CDC_PUBLIC_GEO_TABLE))
In [3]:
#@title
# This either works for a list of fields and one table or a list of tables and one field. 
def FieldAnalysis(project_id, table_list, field_list, title, calculate_race_ethnicity=False):
  field_dict = {}
  if len(field_list) > 1:
    list_for_iteration = field_list
    table = table_list[0]
  else:
    list_for_iteration = table_list
    field = field_list[0]
  for item in list_for_iteration:
    field_dict[item] = [0.0, 0.0, 0.0, 0.0]
  unknowns = pd.DataFrame(field_dict, index=['Unknown', 'Missing', 'NA', 'Known'])
  field_known_history = []
  total_count_history = []
  field_series = []
  value_series = []
  percent_series = []
  cases_series = []
  chart_denominator = 1000000

  field_display_name = {
    'cdc_case_earliest_dt': 'CDC earliest case date',
    'current_status': 'Case status',
    'case_month': 'Case month',
    'res_state': 'State',
    'res_county': 'County',
    'sex': 'Sex',
    'age_group': 'Age',
    'race': 'Race',
    'ethnicity': 'Ethnicity',
    'race_ethnicity_combined': 'Race/Ethnicity',
  }
  for i, item in enumerate(list_for_iteration):
    if len(field_list) > 1:
      field = item
    else:
      table = item    
    field_unknowns_query = ('''
    SELECT
      %s,
      count(*) as cases
    FROM
      %s
    GROUP BY
      %s
    ''')
    if (field == 'race_ethnicity_combined' and (
        calculate_race_ethnicity or item == CDC_TABLE_HISTORY[0])):
      field_unknowns_query = ('''
      SELECT ''' + RACE_ETHNICITY_COMBINED_STR + ''' 
        as %s,
        count(*) as cases
      FROM
        %s
      GROUP BY
        %s
      ''')
    query = field_unknowns_query % (field, table, field)
    field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
    field_unknowns_df.set_index(field, inplace=True)
    field_unknowns_df.index = field_unknowns_df.index.fillna('Null')

    missing_count = 0
    if 'Missing' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Missing'].cases
    if 'Null' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Null'].cases
    if '' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc[''].cases
    if 'OTH' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['OTH'].cases
    if 'nul' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['nul'].cases
    unknowns.loc['Missing', field] = missing_count

    unknown_count = 0
    if 'Unknown' in field_unknowns_df.index:
      unknown_count += field_unknowns_df.loc['Unknown'].cases
    unknowns.loc['Unknown', field] = unknown_count
    na_count = 0
    if 'NA' in field_unknowns_df.index:
      na_count += field_unknowns_df.loc['NA'].cases
    unknowns.loc['NA', field] = na_count
    unknowns.loc['Known', field] = field_unknowns_df.cases.sum() - (
        unknowns.loc['Missing', field] +
        unknowns.loc['Unknown', field] +
        unknowns.loc['NA', field])
    if len(field_list) > 1:
      field_series.extend([field_display_name.get(field, field)] * 4)
    else:
      field_series.extend([DATE_YEAR_DISPLAY_NAME_HISTORY[i]] * 4)
      field_known_history.append(unknowns.loc['Known', field] / field_unknowns_df.cases.sum())
      total_count_history.append(field_unknowns_df.cases.sum())
    value_series.extend(['Known', 'Suppressed', 'Unknown', 'Missing'])
    percent_series.extend([unknowns.loc['Known', field] / field_unknowns_df.cases.sum(),
                           unknowns.loc['NA', field] / field_unknowns_df.cases.sum(),
                           unknowns.loc['Unknown', field] / field_unknowns_df.cases.sum(),
                           unknowns.loc['Missing', field] / field_unknowns_df.cases.sum()])
    cases_series.extend([unknowns.loc['Known', field] / chart_denominator,
                           unknowns.loc['NA', field] / chart_denominator,
                           unknowns.loc['Unknown', field] / chart_denominator,
                           unknowns.loc['Missing', field] / chart_denominator])
    bars_df = pd.DataFrame.from_dict({'field': field_series,
                                    'value': value_series,
                                   'percent': percent_series,
                                   'cases': cases_series})
  chart = alt.Chart(bars_df).mark_bar().encode(
      x=alt.X('percent:Q', axis=alt.Axis(format='%'), title=''),
      y=alt.Y('field:N', title='Field', sort=field_list),
      color=alt.Color('value:N', scale=alt.Scale(scheme='category20'), title='Value'),
      order=alt.Order('field:N'),
      tooltip=[
                  alt.Tooltip('field:N', title='Field'),
                  alt.Tooltip('value:N', title='Value'),
                  alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
      ]
  ).properties(title=title)
  return total_count_history, field_known_history, chart

def CreateNYTStateDataframe(query, include_territories=False):
  nyt_states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  if not include_territories:
    for territory in NYT_TERRITORIES:
     nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
  nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
  nyt_states_df.set_index('state_fips_code', inplace=True)
  return nyt_states_df

def CreateCDCStateDataframe(query):
  states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
    states_df = states_df[states_df.res_state != state]
  states_df.rename(columns={'res_state': 'state'}, inplace=True)
  states_df['state_fips_code'] = states_df.state
  states_df = states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
  states_df['state_fips_code'] = states_df.state_fips_code.astype(int)
  states_df.set_index('state_fips_code', inplace=True)
  return states_df

def CreateCDCStateRaceDataframe(query, cases_field_prefix, include_territories=False):
  states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
    states_df = states_df[states_df.res_state != state]

  states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
  states_df = states_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_KNOWN_MAP})
  states_df.rename(columns={'res_state': 'state'}, inplace=True)

  cases_field = cases_field_prefix + 'cases' 
  crosstab_df = pd.crosstab(states_df['state'],
                            states_df.race_ethnicity_combined,
                            values=states_df[cases_field],
                            aggfunc=sum,
                            margins=True,
                            margins_name=cases_field
  )
  # Have to reset_index() to go from pandas multi-index to single index.
  crosstab_df = crosstab_df.reset_index()
  crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
  crosstab_df[cases_field_prefix + 'known_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
  crosstab_df[cases_field_prefix + 'known_or_na_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_unknown_cases.fillna(0)
  crosstab_df[cases_field_prefix + 'known_cases_percent'] = round(
      crosstab_df[cases_field_prefix + 'known_cases'] /
      crosstab_df[cases_field], 4)
  crosstab_df[cases_field_prefix + 'known_or_na_cases_percent'] = round(
      crosstab_df[cases_field_prefix + 'known_or_na_cases'] /
      crosstab_df[cases_field], 4)
  crosstab_df['state_fips_code'] = crosstab_df.state
  crosstab_df = crosstab_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})

  # Remove territories and missing states for calculating summary stats.
  if not include_territories:
    for territory in TERRITORIES:
      crosstab_df = crosstab_df[crosstab_df.state != territory]
  crosstab_df = crosstab_df[crosstab_df.state != 'NA']
  crosstab_df = crosstab_df[crosstab_df.state != 'Missing']
  crosstab_df = crosstab_df[crosstab_df.state != 'Unknown']
  crosstab_df.set_index('state_fips_code', inplace=True)
  return crosstab_df

def CreateCRDTStateRaceDataframe(query):
  crdt_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for territory in TERRITORIES:
    crdt_df = crdt_df[crdt_df.state != territory]
  crdt_df['state_fips_code'] = crdt_df.state
  crdt_df = crdt_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
  crdt_df['state_fips_code'] = crdt_df.state_fips_code.astype(int)
  crdt_df.set_index('state_fips_code', inplace=True)
  return crdt_df

def CreateNYTCountyDataframe(query):
  nyt_counties_df = pd.io.gbq.read_gbq(NYT_COUNTIES_QUERY, project_id=PROJECT_ID)
  nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
  nyt_counties_df.county_fips.unique()
  nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
  nyt_counties_df.set_index('county_fips', inplace=True)
  return nyt_counties_df

def CreateCDCCountyRaceDataframe(query, cases_field_prefix):
  cdc_counties_race_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for territory in TERRITORIES:
    cdc_counties_race_df = cdc_counties_race_df[cdc_counties_race_df.res_state != territory]

  county_fips_map_df = pd.io.gbq.read_gbq(COUNTY_FIPS_MAPPING_QUERY, project_id=PROJECT_ID)

  county_fips_map_df.cdc_county = county_fips_map_df.cdc_county.str.lower()
  county_fips_map_df['state_county'] = county_fips_map_df.state + '-' + county_fips_map_df.cdc_county
  county_fips_map_df['state_county'] = county_fips_map_df.state_county.astype('string').str.strip()
  county_fips_map_df.set_index('state_county', inplace=True)

  # Concatenate the state and county names because county names are not unique across states.
  cdc_counties_race_df.res_county = cdc_counties_race_df.res_county.str.lower()
  cdc_counties_race_df['state_county'] = cdc_counties_race_df.res_state + '-' + cdc_counties_race_df.res_county
  cdc_counties_race_df['state_county'] = cdc_counties_race_df.state_county.astype('string').str.strip()
  cdc_counties_race_df.set_index('state_county', inplace=True)
  cdc_counties_race_df['race_ethnicity_combined'] = cdc_counties_race_df.race_ethnicity_combined.astype('string').str.strip()
  cdc_counties_race_df = cdc_counties_race_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})

  # Printed value used in the footnotes below.
  # All other checks for county_fips_code mappings are now in
  # https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006
  mismatches_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
  mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
  mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
  mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
  mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
  mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
  mismatches_df = mismatches_df[mismatches_df.res_county != 'missing']
  # print(mismatches_df.cases.sum())

  cdc_counties_race_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')

  cases_field = cases_field_prefix + 'cases'
  # Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
  cdc_counties_race_crosstab_df = pd.crosstab(cdc_counties_race_df['county_fips'],
                                              cdc_counties_race_df.race_ethnicity_combined,
                                              values=cdc_counties_race_df[cases_field],
                                              aggfunc=sum,
                                              margins=True,
                                              margins_name=cases_field
 )
  # Have to reset_index() to go from pandas multi-index to single index.
  cdc_counties_race_crosstab_df = cdc_counties_race_crosstab_df.reset_index()
  cdc_counties_race_crosstab_df.drop(axis=0, index=len(cdc_counties_race_crosstab_df) - 1, inplace=True)
  cdc_counties_race_crosstab_df['county_fips'] = cdc_counties_race_crosstab_df.county_fips.astype(int)
  cdc_counties_race_crosstab_df[cases_field_prefix + 'known_cases'] = (
      cdc_counties_race_crosstab_df[cases_field] -
      cdc_counties_race_crosstab_df.na_cases.fillna(0) -
      cdc_counties_race_crosstab_df.unknown_cases.fillna(0))

  # Get the display names for each county.
  # Use ACS data that only has one FIPS code per county unlike the fips_county_map.
  acs_name_lookup_df = pd.io.gbq.read_gbq(ACS_POPULATION_DATA_QUERY, project_id=PROJECT_ID)

  acs_name_lookup_df['state_county'] = (acs_name_lookup_df.county.astype('string').str.strip() +
                                        ', ' + acs_name_lookup_df.state.astype('string').str.strip())
  acs_name_lookup_df.drop(columns=['state', 'county'], inplace=True)
  acs_name_lookup_df.set_index('county_fips', inplace=True)

  cdc_counties_race_df = cdc_counties_race_crosstab_df.join(acs_name_lookup_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
  cdc_counties_race_df.county_fips = cdc_counties_race_df.county_fips.astype(int)
  cdc_counties_race_df.set_index('county_fips', inplace=True)
  return cdc_counties_race_df

def CreateScatterPlot(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
    axis_format = ',.0f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'
    axis_format = '.0%'

  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))
  plot = alt.Chart(chart_df).mark_circle(size=60).encode(
      alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      color=alt.Color(fields_dict['percent']['name'],
                      type='quantitative',
                      scale=alt.Scale(scheme=scale_scheme,
                                      reverse=scale_reverse,
                                      domain=scale_domain,
                                      clamp=True),
                      legend=alt.Legend(format=legend_format),
                      title=metric_type.capitalize()),
      tooltip=tooltips,
  ).properties(
      height=height,
      width=width,
  )
  if metric_type == 'ratio':
    plot.interactive()

  line = pd.DataFrame({
      'x': [0, scale_max],
      'y': [0, scale_max],
  })

  if metric_type == 'ratio':
    line_plot = alt.Chart(line).mark_line(color='black').encode(
        x='x',
        y='y',
    )
  elif metric_type == 'percent':
    line_plot = (
        alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
        alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
    )
  # Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
  scatter = (plot + line_plot).properties(
      title=title,
      height=height,
      width=width,
  ).interactive()
  return scatter

def CreateMap(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  fips_code = 'state_fips_code'
  topo_feature = US_STATES_TOPO
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'
    fips_code = 'county_fips'
    topo_feature = US_COUNTIES_TOPO

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'

  highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))

  field_names = [geo_field]
  field_names.extend([fields_dict[field]['name'] for field in fields_dict])
  plot = alt.Chart(topo_feature).mark_geoshape(
        stroke='white',
        strokeOpacity=.2,
        strokeWidth=1
    ).project(
      type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(chart_df, fips_code, field_names)
    ).encode(
        alt.Color(fields_dict['percent']['name'],
                  type='quantitative',  
                  legend=alt.Legend(format=legend_format),
                  scale=alt.Scale(scheme=scale_scheme,
                                  reverse=scale_reverse,
                                  domain=scale_domain,
                                  clamp=True,
                                  ),
                  title=metric_type.capitalize()),
         tooltip=tooltips
    ).add_selection(
        highlight,
    )

  states_outline = alt.Chart(US_STATES_TOPO).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
        type='albersUsa'
  )

  states_fill = alt.Chart(US_STATES_TOPO).mark_geoshape(
        fill='silver',
        stroke='white'
  ).project('albersUsa')

  layered_map = alt.layer(states_fill, plot, states_outline).properties(
        height=height,
        width=width,
        title=title,
  )
  return layered_map

def CreateScatterPlotAndMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
  scatter = CreateScatterPlot(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
  map = CreateMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
  return (scatter | map).configure_view(
       strokeWidth=0,
   ).configure_mark(
       stroke='grey'
   ).configure_legend(
       gradientLength=scatter_height - 50
   )

def CreateLineChart(title, dates, race_ethnicity_known_history, overall_nyt_percent, overall_composite):
  line_chart_dict = {}
  line_chart_dict['date'] = dates * 3
  line_chart_dict['metric'] = (['Percent with Race/Ethnicity'] * len(dates) +
                               ['Percent of NYT total case counts'] * len(dates) +
                               ['Composite percent'] * len(dates)

  )
  line_chart_dict['value'] = (race_ethnicity_known_history +
                              overall_nyt_percent +
                              overall_composite
                              )
  line_chart_df = pd.DataFrame(line_chart_dict)
  line = alt.Chart(line_chart_df).mark_line(point=True).encode(
      x=alt.X('yearmonth(date):O', title='', axis=alt.Axis(labelAngle=0)),
      y=alt.Y('value:Q', title='', axis=alt.Axis(format='%')),
      color=alt.Color('metric', title='', scale=alt.Scale(scheme='browns'),
                      sort=['Percent of NYT total case counts',
                            'Percent with Race/Ethnicity',
                            'Composite percent',
                           ]),
  )

  nearest = alt.selection(type='single', nearest=True, on='mouseover',
                          fields=['date'], empty='none')

  # Transparent selectors across the chart. This is what tells us
  # the x-value of the cursor
  selectors = alt.Chart(line_chart_df).mark_point().encode(
      x='yearmonth(date):O',
      opacity=alt.value(0),
  ).add_selection(
      nearest
  )

  # Draw points on the line, and highlight based on selection
  points = line.mark_point().encode(
      opacity=alt.condition(nearest, alt.value(1), alt.value(0))
  )

  # Draw text labels near the points, and highlight based on selection
  text = line.mark_text(align='left', dx=7, dy=-7).encode(
      text=alt.condition(nearest, 'value:Q', alt.value(' '), format='.0%')
  )

  # Put the five layers into a chart and bind the data
  line_chart = alt.layer(
      line, selectors, points, text
  ).properties(
      title=title,
      width=550,
      height=300
  )
  return line_chart

def PrintSummaryStats(chart_df, field='percent'):
  plus_minus_15_df = chart_df[chart_df[field] >= .85]
  plus_minus_15_df = plus_minus_15_df[plus_minus_15_df[field] <= 1.15]
  print('between +/-15%: ', len(plus_minus_15_df), round(len(plus_minus_15_df) / len(chart_df), 2))
  plus_minus_50_df = chart_df[chart_df[field] >= .50]
  plus_minus_50_df = plus_minus_50_df[plus_minus_50_df[field] <= 1.50]
  print('between +/-50%: ', len(plus_minus_50_df), round(len(plus_minus_50_df) / len(chart_df), 2))
  print('< than .50: ', len(chart_df[chart_df[field] < .5]))
  print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
  print(chart_df[field].describe())

Abstract

The Covid Tracking Project was the most reliable source for COVID-19 data with race/ethnicity at the state level until it stopped collecting data on March 7, 2021. The CDC's Case Surveillance Restricted Access and Public Use with Geography datasets are the only available replacements for the Covid Tracking Project's dataset, and they additionally include county-level data and age along with race/ethnicity. This paper evaluates the completeness of the CDC datasets at the state and county levels in terms of (1) the total number of cases included compared to the New York Times, and (2) the number of cases included with race/ethnicity data compared to the Covid Tracking Project.

The CDC's Restricted Access dataset contains 78% of the cases in the New York Times up to April 15, 2021, and 65% of cases have race/ethnicity information vs. 67% in the Covid Tracking Project. The dataset's completeness has steadily and gradually improved over time; e.g., the first available version from May 2020 had race/ethnicity information for only 43% of cases. At the state and county levels, the dataset's completeness has also improved with a state-level average of 62% of cases with race/ethnicity in April 2021 vs. 46% in June 2020. However, the dataset's completeness at the state level is highly variable; for example, Minnesota has 102% of the cases included in the New York Times, while Louisiana has only 4% of the cases in the New York Times. Minnesota has 91% of cases with race/ethnicity, while Louisiana has only 19% with race/ethnicity (vs. 94% in the Covid Tracking Project). Texas alone is missing 2.8M cases, accounting for more than a third of the total 7.1M missing cases. New York is missing race/ethnicity for 1.3M cases and California for 1.1M cases, accounting for more than a quarter of the 8.6M cases missing race/ethnicity when combined.

The CDC's Public Use with Geography dataset is similar to the Restricted Access dataset for total case counts, but is less complete due to more privacy suppression; e.g., only 49% of cases have race/ethnicity information.

Background

The racial and ethnic disparities in the COVID-19 pandemic have exposed longstanding health inequities in the U.S., which have been described in multiple analyses of COVID-19 data by the Covid Tracking Project, New York Times, American Public Media Research Lab, and Kaiser Family Foundation among many others. However, we still don't have a full understanding of these disparities because race/ethnicity data continue to be incomplete more than a year after the pandemic started. As the Covid Tracking Project wrote on January 29, "the continued lack of either complete federal demographic data or federal guidelines for what states should publish make it impossible to fully understand who is being infected with and dying of COVID-19."

The most reliable and up-to-date data are scattered across state and local public health websites that use different standards and categories for reporting race/ethnicity. In 2020, collecting these data and turning them into a unified dataset was largely left to non-governmental organizations like the Covid Tracking Project, which stopped collecting data on March 7, 2021, a full year after it started. Even the federal government looked to the Covid Tracking Project for reliable COVID-19 race/ethnicity data. The office of the Assistant Secretary for Planning and Evaluation, an agency within the U.S. Department of Health and Human Services, wrote in October 2020, "The volunteer-based COVID tracking project has created the most comprehensive centralized resource for race and ethnicity data at the state level."

At the county level, finding race/ethnicity data for cases was nearly impossible for most of 2020. Since November 2020, the CDC has shown total case counts at the county level in a dashboard. Before the CDC published that data, several non-governmental organizations (New York Times, Johns Hopkins University, USAFacts) gathered data for total case counts at the county level. But none of these sources collected or published race/ethnicity data, which would have been a huge undertaking due to the non-standard reporting of race/ethnicity across state and local public health websites. The only public analysis of case data with race/ethnicity at the county level was in July 2020 when the New York Times published The Fullest Look Yet at the Racial Inequity of Coronavirus. The New York Times used CDC Case Surveillance data that they obtained via FOIA and legal action to analyze cases up to May 28, 2020.

Now that the Covid Tracking Project has stopped collecting data, there is only one option for COVID-19 case data with race/ethnicity unified across U.S. states and counties: the CDC's Case Surveillance data, which are based on state and local health departments reporting cases to the CDC. There are more options for data on deaths, which we discuss in a separate deaths data report.

The CDC publishes several variations of the Case Surveillance data:

  1. Public Use dataset at the U.S. level (dashboard)
  2. Public Use with Geography dataset with state and county information (dashboard)
  3. Restricted Access dataset with state and county information

These three datasets are from the same underlying source; they have the same number of records and are released every two weeks (monthly until May 2021). The Public Use dataset was first published in May 2020. The Restricted Access dataset was first published at the link above in November 2020; the dataset itself is in a private GitHub repository that the CDC shares with people who apply for access and adhere to a data use agreement. The CDC's initial data agreement did not allow for county-level analyses to be made public, but an updated data agreement from December 14, 2020 allowed such analyses. In January 2021, the Morehouse School of Medicine's Satcher Health Leadership Institute (MSM/SHLI) in collaboration with Google.org applied for and got access to this data within a few days. Several months later in March 2021, the CDC published the Public Use with Geography dataset with state and county information, fewer fields, and more data suppression than the Restricted Access dataset. In addition to these datasets, the CDC also publishes charts showing race/ethnicity breakdowns at the U.S. level in their COVID Data Tracker; see the Appendix for a comparison of that tracker to these datasets.

The CDC's Case Surveillance Restricted Access dataset has enormous potential: It could allow us to analyze data across all states and counties to study the disparities in COVID-19 cases using consistent race/ethnicity categories. This dataset could pick up where the Covid Tracking Project's dataset left off and additionally enable the first analyses of race/ethnicity disparities at the county level since July 2020. The dataset also has age and sex for each case, so we could analyze the intersection of race/ethnicity with age and sex. The dataset has case report dates, which would allow us to look at cases over time. However, the dataset will only live up to its potential if it is complete both in terms of the number of cases included and the number of cases that have race/ethnicity.

Our analysis shows that the CDC's Case Surveillance Restricted Access dataset has two main data completeness issues:

  • Only 78% of total cases in the New York Times up to April 15 are included (7.1M out of 31.5M cases are missing)
  • Of the cases in the dataset, only 65% have known race/ethnicity (8.6M out of 24.4M cases are missing race/ethnicity)

For the 15.8M cases where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:

In [4]:
#@title
overall_df = pd.io.gbq.read_gbq(CDC_OVERALL_RACE_QUERY, project_id=PROJECT_ID)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})
overall_df = overall_df.set_index('race_ethnicity_combined')

chart_denominator = 1000000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
         overall_df.cases['black_cases'] / chart_denominator,
         overall_df.cases['white_cases'] / chart_denominator,
         overall_df.cases['asian_cases'] / chart_denominator,
         overall_df.cases['nhpi_cases'] / chart_denominator,
         overall_df.cases['aian_cases'] / chart_denominator,
         overall_df.cases.sum() / chart_denominator,
]

# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
    60481746 / chart_denominator,
    40596040  / chart_denominator,
    196789401 / chart_denominator,
    18427914  / chart_denominator,
    565473 / chart_denominator,
    2236348 / chart_denominator,
    328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
  percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
    'Hispanic/Latino',
    'Black',
    'White',
    'Asian',
    'Native Hawaiian/Pacific Islander',
    'American Indian/Alaska Native',
    '*Total Including Unknowns*',
], 'percent': percent_list,
   'cases': cases_list,
   'population': pop_list,
})
bars = alt.Chart(prevalence).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='.1%'), title=''),
      y=alt.Y('group', sort='-x', title=''),
      color=alt.Color('group', 
                      scale=alt.Scale(scheme='tableau20'),
                      title='',
                      legend=None),
      tooltip=[
                  alt.Tooltip('group:N', title='Race/Ethnicity Group'),
                  alt.Tooltip('percent:Q', format='.2%', title='Prevalence within group'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
                  alt.Tooltip('population:Q', format=',.2f', title='Population of group (millions)'),
      ]
).properties(
   title='Percent of Race/Ethnicity Group who had COVID-19 based on Incomplete CDC Data up to %s' % DATE_DISPLAY_NAME
)

alt.concat(bars).properties(
    title=alt.TitleParams(
        ['Population data source: The U.S. Census Bureau\'s American Community Survey 2019 5-year estimates.'],
        baseline='bottom',
        dy=20,
        orient='bottom',
        fontWeight='normal',
        fontSize=11
    )
).display()

The chart above is based on incomplete data. With only 78% of cases included, the total percent of people who had COVID-19 should be 9.6% instead of 7.4%. It's harder to estimate how much the race/ethnicity groups are undercounting the number of confirmed COVID-19 cases. If we added all 8.6M cases with missing race/ethnicity to the Hispanic/Latino group, the percent of Hispanic/Latinos in the U.S. who had COVID-19 would go from 7.4% to 21.7% — a 3x increase. If all 8.6M cases with missing race/ethnicity were Black people, the percent of Black people who had COVID-19 would go from 4.3% to 25.5% — a 6x increase. While these extreme scenarios are unlikely, they show us why missing race/ethnicity data is preventing us from truly understanding and addressing the disparities in the COVID-19 pandemic in the U.S.

At the same time, the data from state public health websites are not perfect; the Covid Tracking Project only has race/ethnicity data for 67% of cases up to March 7. We'll compare data completeness at the state level in the Case Surveillance data vs. the New York Times and the Covid Tracking Project to better understand how the data completeness compares between those datasets.

Overview

The goal of this analysis is to assess the completeness of the CDC's Restricted Access and Public Use with Geography datasets, which will help evaluate their usefulness in examining disparities in race/ethnicity for COVID-19 cases at the state and county levels. We'll first focus on the Restricted Access dataset, which is more complete than the Public Use with Geography dataset due to privacy reasons.

The overall data completeness findings for the CDC's Restricted Access Dataset are:

  1. Data Overview: Most fields in the CDC's Restricted Access dataset are missing too many values to be useful. The fields that are reliably filled in are dates of reporting and symptoms, case status (lab confirmed or probable), state, county, sex, age, and race/ethnicity. Whether the person died was known for 52% of cases and whether they were hospitalized was known for 40% of cases. All other fields were known for 47% or fewer of the cases. Race/ethnicity was only known for 65% of cases, as opposed to 97%-100% for all the other fields below.
In [5]:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
_, _, chart = FieldAnalysis(PROJECT_ID, [CDC_TABLE], field_list, 'Most Complete Fields in the CDC Restricted Access Dataset')
chart.display()
  1. Total Case Counts: The CDC's Restricted Access dataset contains 78% of the cases reported in the New York Times (NYT) up to April 15. This discrepancy is not solely due to a time lag across all states; there's high variability at the state level with 29 states being within 15% of the NYT total case counts, and nine states with fewer than 50% of the NYT total case counts. Wyoming and Texas have the biggest discrepancies with only 2% and 3% of the total cases included, respectively. While it is expected that the CDC data will lag, a time lag alone can't explain the discrepancies in some states.
  2. Cases with Race/Ethnicity: Race/ethnicity is available for 65% of cases in the CDC dataset compared to 67% in the Covid Racial Data Tracker (CRDT). Race/ethnicity data availability is highly variable across different states, which is common to both the CDC and CRDT datasets, but the CRDT has more cases with race/ethnicity information than the CDC in all but two states, New York and Massachusetts.

We can evaluate the overall completeness of the CDC datasets by calculating at (1) the percent of total case counts compared to the NYT, (2) the percent of cases with race/ethnicity, and then (3) combine those two percentages into a composite that represents the percentage of total expected cases that have race/ethnicity. Later on, we will do this same analysis at the state and county levels.

In [6]:
#@title
# Manually update these fields based on chart above, latest CDC data,
# and improving state/county data below.

overall_row_names = [
    'Update frequency',
    'Latest case date',
    'Cases in dataset as of date',
    'Cases in NYT as of date',
    '(as a % of NYT)',
    'Cases with race/ethnicity',
    '(as a % of cases in dataset)',
    '(composite % of NYT total with race/ethnicity)',
]
overall_crdt_metadata = [
    'Stopped',
    'Mar 7, 2021',
    '28.8M',
    '29.0M',
    '(99%)', 
    '19.2M',
    '(67%)',
    '(66%)',
]
overall_cdc_metadata = [
    'Every two weeks',                        
    'April 15, 2021',
    '24.4M',
    '31.5M',
    '(78%)',
    '15.8M',
    '(65%)',
    '(50%)',
]
overall_cdc_public_geo_metadata = [
    'Every two weeks',                        
    'April 15, 2021',
    '24.4M',
    '31.5M',
    '(78%)',
    '12.0M',
    '(49%)',
    '(38%)',
]
table_data = {'CRDT': overall_crdt_metadata, 'CDC Restricted': overall_cdc_metadata, 'CDC Public Geo': overall_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=overall_row_names)
metadata_df.head(15)
Out[6]:
CRDT CDC Restricted CDC Public Geo
Update frequency Stopped Every two weeks Every two weeks
Latest case date Mar 7, 2021 April 15, 2021 April 15, 2021
Cases in dataset as of date 28.8M 24.4M 24.4M
Cases in NYT as of date 29.0M 31.5M 31.5M
(as a % of NYT) (99%) (78%) (78%)
Cases with race/ethnicity 19.2M 15.8M 12.0M
(as a % of cases in dataset) (67%) (65%) (49%)
(composite % of NYT total with race/ethnicity) (66%) (50%) (38%)

The CDC Restricted Access Dataset's completeness has improved over time. Each new data release updates historical cases to add more information and even new cases. The percent of cases with race/ethnicity started at 43% in May 2020 and is now at 65% in April 2021. The composite measure with the percent of NYT total cases that have race/ethnicity increased from 33% in May 2020 to 50% in April 2021.

In [7]:
#@title
total_count_history, race_ethnicity_known_history, historical_bar_chart = FieldAnalysis(
    PROJECT_ID, CDC_TABLE_HISTORY, ['race_ethnicity_combined'],
    'Race/Ethnicity Completeness by Dataset Release Date')

overall_nyt_percent = []
for i, table in enumerate(CDC_TABLE_HISTORY):
  nyt_us = pd.io.gbq.read_gbq(NYT_US_QUERY_STR % DATE_UP_TO_HISTORY[i], project_id=PROJECT_ID)
  overall_nyt_percent.append(total_count_history[i] / nyt_us.nyt_cases[0])

overall_composite = []
for i, _ in enumerate(total_count_history):
  overall_composite.append(race_ethnicity_known_history[i] * overall_nyt_percent[i])
In [8]:
#@title
CreateLineChart('CDC Restricted Access Dataset Completeness by Release Date',
                DATE_YEAR_HISTORY,
                race_ethnicity_known_history,
                overall_nyt_percent,
                overall_composite
).display()

We can also look more closely at how the percent of cases with race/ethnicity increased over time.

In [9]:
#@title
historical_bar_chart.display()

The improvements in data completeness were due to an increase in the percent cases with known race/ethnicity and a decrease in the percent of cases with data suppression over time from 9% in June 2020 to 1% in April 2021. Note that the May 2020 dataset had no data suppression possibly because it also had no state or county information.

At the end of this report, we'll look at how the CDC Restricted Access dataset's completeness has improved over time at the state level.

What we didn't include in this report:

Restricted Data Completeness Analysis

Data Overview

The CDC datasets come from a case report form that is a dense, five-page form about each lab-confirmed or probable COVID-19 case (old form prior to Jan 15, 2021). The Restricted Access dataset contains 32 fields, the Public Use with Geography dataset contains 19 fields, and the Public Use version contains 12 fields. In addition to data from the case report form, the CDC separately collects aggregate data without race/ethnicity information.

The CDC distinguishes between aggregate data that comes from state and local public health websites vs. line- or case-level data that comes to the CDC from public health departments. The CDC FAQs say that aggregate data are more accurate than case data:

Aggregate counts provide the most up-to-date validated numbers on cases and deaths.

Public health websites often do contain race/ethnicity details, but state websites do not all use the same standard race/ethnicity categories, which makes it hard to collect and aggregate the data. The CRDT captured the many non-standard ways in which different states report on race/ethnicity, where ethnicity is whether a person is Hispanic/Latino. Some states report race/ethnicity as a combined field where each race/ethnicity group is mutually exclusive, which is how the CDC case dataset reports this field. Other states report race/ethnicity as separate fields where Hispanic/Latino people are counted within different race groups as well as in a separate field for ethnicity. States can also differ in terms of which race categories they use, how they define them, whether multiracial people are counted multiple times in different categories, and what's included in the "Other" race category. For more details, see this CRDT analysis.

So, aggregate data are more accurate and timely than case-level data, but case-level data use standardized race/ethnicity categories across all states and counties. If we want to use case-level data, we need to better understand the severity of these accuracy and timeliness issues.

The CDC has an FAQ about data completeness:

How complete are the data that the CDC receives about COVID-19 cases?

The COVID-19 pandemic has put unprecedented demands on the public health data supply chain. In many states, the large number of COVID-19 cases has severely strained the ability of hospitals, healthcare providers, and laboratories to report cases with complete demographic information, such as race and ethnicity. The unprecedented volume of cases has also limited the ability of state and local health departments to conduct thorough case investigations and collect all requested case data.

As a result, many COVID-19 case notifications submitted to CDC do not have complete information on patient demographics [...] Because it can be time-consuming for jurisdictions to collect the additional information, these data can lag behind the aggregate counts. Because of missing data, analyses of these data elements are likely an underestimate of the true occurrence.

The CDC's COVID Data Tracker says that they are "working with states to provide more information on race/ethnicity for reported cases. The percent of reported cases that include race/ethnicity data is increasing."

Total Case Counts

We will compare the CDC data against two sources of aggregate data: The NYT and the CRDT's public data, which are aggregated from state and local public health websites. The CRDT is the only source for case data with race/ethnicity breakdowns, but there are several sources for county-level total case counts in addition to the NYT, such as JHU and USAFacts (this paper analyzes the differences between those sources at the state level up to July, 2020 for cases and deaths).

The table below compares geographic vs. race/ethnicity availability for these three different data sources:

In [10]:
#@title
row_names = [
    'Total Cases — States',
    'Total Cases — Counties',
    'Cases by Race/Ethnicity — States',
    'Cases by Race/Ethnicity — Counties'
]
nyt_yn = [
    '✅',
    '✅',
    '❌',
    '❌',
]
crdt_yn = [
    '✅',
    '❌',
    '✅',
    '❌',
]
cdc_yn = [
    '✅',
    '✅',
    '✅',
    '✅',
]
table_data = {'NYT': nyt_yn, 'CRDT': crdt_yn, 'CDC': cdc_yn}
availability_df = pd.DataFrame(table_data, index=row_names)
availability_df.head()
Out[10]:
NYT CRDT CDC
Total Cases — States
Total Cases — Counties
Cases by Race/Ethnicity — States
Cases by Race/Ethnicity — Counties

Because the CDC is the only data source that has race/ethnicity at the county level, the most similar data for purposes of comparison are (1) NYT data at the state and county levels with no race/ethnicity, and (2) CRDT data at the state level with race/ethnicity.

We will compare against the NYT up to April 15, 2021, which is the latest reporting date in the CDC data. We will compare against the CRDT up to March 7, 2021, which was the last date they reported data. We expect to see small differences (e.g., +/- 15%) in the case counts due to time lags in reporting the data, but time lags cannot explain large percentages of missing cases.

Baseline: CRDT vs. NYT

To get a baseline of how much we could expect the CDC case counts to match the CRDT or NYT, we can see how closely the CRDT and NYT match each other. Each dot below is a state (hover to see details), and the black line shows where the CRDT and NYT case counts are equal.

In [11]:
#@title
crdt_states_df = pd.io.gbq.read_gbq(CRDT_QUERY, project_id=PROJECT_ID)
crdt_states_df.set_index('state', inplace=True)

nyt_states_compare_crdt_df = CreateNYTStateDataframe(NYT_STATES_COMPARE_CRDT_QUERY)

crdt_states_df.reset_index(inplace=True)
crdt_states_df['state_fips_code'] = crdt_states_df.state
crdt_states_df = crdt_states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
crdt_states_df.set_index('state_fips_code', inplace=True)
nyt_crdt_counts_df = nyt_states_compare_crdt_df.join(crdt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')

nyt_crdt_counts_df['percent'] = round(nyt_crdt_counts_df.crdt_cases / nyt_crdt_counts_df.nyt_cases, 2)
nyt_crdt_counts_df.reset_index(inplace=True)
In [12]:
#@title
nyt_crdt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT cases'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CRDT to NYT'},
}
nyt_crdt_title = 'Ratio of CRDT to NYT Cases by State up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME

CreateScatterPlotAndMap(
    nyt_crdt_counts_df, nyt_crdt_fields_dict, nyt_crdt_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(nyt_crdt_counts_df)

The ratio of CRDT to NYT cases is between 0.92 and 1.03 for all states:

  • Average = 1.00
  • Median = 1.00
  • Min = 0.91 (Missouri)
  • Max = 1.03 (Georgia)
  • Percent between 0.85 and 1.15 = 100% (50 states + D.C. within +/- 0.15)

States: CDC vs. NYT

We can see below that the CDC Restricted Access dataset case counts differ from the NYT case counts more drastically than the CRDT did. Note: In the analysis and charts below, we'll refer to the CDC Restricted Access dataset as the "CDC" dataset.

In [13]:
#@title
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY)

cdc_nyt_states_df = cdc_states_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
In [14]:
#@title
cdc_nyt_state_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_state_title = 'Ratio of CDC to NYT Cases by State up to %s' % DATE_DISPLAY_NAME

CreateScatterPlotAndMap(
    cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_nyt_states_df)

Texas alone is missing 2.8M cases compared to the total case counts in the NYT data (39% of the 7.1M missing cases across all states).

The ratio of CDC to NYT cases is between 0.02 and 1.04 for all states + D.C.:

  • Average = 0.76
  • Median = 0.93
  • Min = 0.02 (Wyoming)
  • Max = 1.04 (New York)
  • Percent between 0.85 and 1.15 = 57% (29 states within +/- 0.15)
  • Percent between 0.50 and 1.50 = 82% (42 states within +/- 0.50)

The 29 states that are within +/-15% of the NYT data could plausibly be off due to time lags in reporting cases to the CDC vs. reporting them on state public health websites, but there are many outlier states that are too far off from the NYT case counts to be explained by a time lag:

  • 9 states: < 0.50 ratio of CDC to NYT cases
  • 5 states: < 0.10 ratio of CDC to NYT cases (Wyoming, Texas, Louisiana, West Virginia, and New Hampshire)

Counties: CDC vs. NYT

We can do the same analysis at the county level. The CDC Restricted Access dataset contains 3,072 counties in the 50 states + D.C., which is 98% of all counties that account for 99.9% of the population.

Each dot is a county (hover to see details). We show all 3,063 counties in the CDC data that were also in the NYT data on the left and zoom in on the smaller counties on the right. Note that the five counties in New York City and four boroughs in Alaska are missing because the NYT combined them into one region for New York City and two combined regions in Alaska; see the Appendix for more details.

In [15]:
#@title
# CDC vs. NYT county
cdc_counties_race_df = CreateCDCCountyRaceDataframe(CDC_COUNTIES_RACE_QUERY, 'cdc_')
nyt_counties_df = CreateNYTCountyDataframe(NYT_COUNTIES_QUERY)

cdc_nyt_counties_race_df = cdc_counties_race_df.join(nyt_counties_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
cdc_nyt_counties_race_df = cdc_nyt_counties_race_df.reset_index()
cdc_nyt_counties_race_df['percent'] = round(cdc_nyt_counties_race_df.cdc_cases / cdc_nyt_counties_race_df.nyt_cases, 2)
cdc_nyt_counties_race_df['cdc_known_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_cases /
                                                            cdc_nyt_counties_race_df.cdc_cases, 2)
cdc_nyt_counties_race_df['cdc_known_or_na_cases'] = (cdc_nyt_counties_race_df.cdc_known_cases.fillna(0) +
                                                     cdc_nyt_counties_race_df.na_cases.fillna(0))
cdc_nyt_counties_race_df['cdc_known_or_na_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_or_na_cases /
                                                                  cdc_nyt_counties_race_df.cdc_cases, 2)

#PrintSummaryStats(cdc_nyt_counties_race_df)
# These values are used in the num_counties table.
#print(len(cdc_counties_race_df))
#print(len(cdc_counties_race_df) / 3143)
#print(cdc_counties_race_df.total_pop.sum())
#print(cdc_counties_race_df.total_pop.sum() / 324697795)  # Population covered in these counties
#print(cdc_counties_race_df.cdc_known_cases.sum())
#print(0.55 * 324697795) # NYT population
In [16]:
#@title
cdc_nyt_county_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_county_title = 'Ratio of CDC to NYT Cases by County up to %s' % DATE_DISPLAY_NAME
zoom_cdc_nyt_title = 'Zoom in on counties with up to 200,000 Cases'

cdc_nyt_county_scatter = CreateScatterPlot(
    cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, COUNTY_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)
cdc_nyt_county_zoom_scatter = CreateScatterPlot(
    cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, zoom_cdc_nyt_title, COUNTY_CASES_ZOOM_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)

(cdc_nyt_county_scatter | cdc_nyt_county_zoom_scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=MAP_HEIGHT - 50
).configure_mark(
    stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_counties_race_df)