How complete are the CDC's COVID-19 Case Surveillance datasets for race/ethnicity at the state and county levels?

April 5, 2021

In [1]:
#@title
# How to update the data:
# 1. You may need to copy this colab so you have your own version.
# 2. Update the table name constants below to have the latest data's suffix.
# 3. Update the date variables below to be the last case date included in the data.
# 4. Updates the scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
#    Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full updated of
#    the paper including text and tables, a lot of that is done in commented out PrintSummaryStats() statements.

import pandas as pd
import altair as alt
from vega_datasets import data

from google.colab import auth
auth.authenticate_user()

# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)

# Project and table names.
PROJECT_ID = 'msm-secure-data-1b'
CDC_TABLE = '`%s.ndunlap_secure.cdc_restricted_access_20210331`' % PROJECT_ID
CDC_PUBLIC_GEO_TABLE = '`%s.ndunlap_secure.cdc_public_use_with_geo_20210331`' % PROJECT_ID
CRDT_TABLE = '`%s.ndunlap_secure.crdt_20210307`' % PROJECT_ID
CREW_TABLE = '`msm-internal-data.crew.covid_case_surveillance`'

# Dates in different formats.
DATE = 'DATE(2021, 03, 16)'
DATE_DISPLAY_NAME = 'Mar 16'
CRDT_DATE = '20210307'
CRDT_COMPARE_DATE = 'DATE(2021, 03, 07)'
CRDT_COMPARE_DATE_DISPLAY_NAME = 'Mar 7'
TOTAL_CRDT_CASES = 28756184	 # from https://covidtracking.com/data/national

# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
TOTAL_CASES_SCALE_MAX = 4000000
COUNTY_CASES_SCALE_MAX = 1400000
COUNTY_CASES_ZOOM_SCALE_MAX = 200000
CASES_RACE_SCALE_MAX = 3000000 # known race/ethnicity
COUNTY_CASES_RACE_SCALE_MAX = 400000 # known race/ethnicity

# Chart settings.
SCATTER_HEIGHT = 300
SCATTER_WIDTH = 300
MAP_HEIGHT = 300
MAP_WIDTH = 450
US_STATES_TOPO = alt.topo_feature(data.us_10m.url, 'states')
US_COUNTIES_TOPO = alt.topo_feature(data.us_10m.url+"#", 'counties')

TERRITORIES = ('PR', 'GU', 'VI', 'MP', 'AS')
NYT_TERRITORIES = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
STATES_TO_FIPS = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12, 'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56, 'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78, 'NYC': 36}
FIPS_TO_STATES = {STATES_TO_FIPS[key]: key for key in STATES_TO_FIPS}
RACE_ETHNICITY_COMBINED_MAP = {
    'Asian, Non-Hispanic': 'asian_cases',
    'Black, Non-Hispanic': 'black_cases',
    'White, Non-Hispanic': 'white_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
    'Hispanic/Latino': 'hispanic_cases',
    'Multiple/Other, Non-Hispanic': 'other_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
    'Missing': 'unknown_cases',
    'Unknown': 'unknown_cases',
    'NA': 'na_cases',
}
RACE_ETHNICITY_COMBINED_KNOWN_MAP = {
   'Asian, Non-Hispanic': 'cdc_known_cases',
   'Black, Non-Hispanic': 'cdc_known_cases',
   'White, Non-Hispanic': 'cdc_known_cases',
   'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
   'Hispanic/Latino': 'cdc_known_cases',
   'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
   'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
   'Missing': 'cdc_unknown_cases',
   'Unknown': 'cdc_unknown_cases',
   'NA': 'cdc_na_cases',
}
In [2]:
#@title
CDC_OVERALL_RACE_QUERY = ('''
SELECT
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
GROUP BY
   1
''' % CDC_TABLE)

CRDT_QUERY = ('''
SELECT
  State as state,
  Cases_Total as crdt_cases,
  Cases_Total - Cases_Unknown as crdt_known_race_cases,
  ROUND(1 - Cases_Unknown / Cases_Total, 4) as crdt_known_race_cases_percent,
  Cases_Total - Cases_Ethnicity_Unknown as crdt_known_ethnicity_cases,
  ROUND(1 - Cases_Ethnicity_Unknown / Cases_Total, 4) as crdt_known_ethnicity_cases_percent,
FROM %s
WHERE
  date = %s
''' % (CRDT_TABLE, CRDT_DATE))

NYT_STATES_QUERY_STR = ('''
SELECT
  state_name,
  state_fips_code,
  confirmed_cases as nyt_cases,
  deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
  date = %s AND
  state_fips_code IS NOT NULL
''')

NYT_STATES_QUERY = NYT_STATES_QUERY_STR % DATE
NYT_STATES_COMPARE_CRDT_QUERY = NYT_STATES_QUERY_STR % CRDT_COMPARE_DATE

CDC_STATES_QUERY = ('''
SELECT
  res_state,
  COUNT(*) as cdc_cases
FROM
  %s
GROUP BY
   res_state
''' % CDC_TABLE)

NYT_COUNTIES_QUERY = ('''
SELECT
  county_fips_code,
  confirmed_cases as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s AND
  county_fips_code IS NOT NULL
''' % DATE)

CDC_COUNTIES_RACE_QUERY = ('''
SELECT
  res_state,
  res_county,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
GROUP BY
   res_county,
   res_state,
   race_ethnicity_combined
''' % CDC_TABLE)

COUNTY_FIPS_MAPPING_QUERY = ('''
SELECT
*
FROM
  `msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''')

ACS_POPULATION_DATA_QUERY = ('''
SELECT
  state,
  county,
  county_fips,
  total_pop
FROM
  `msm-internal-data.ipums_acs.acs_2019_5year_county`
''')

CDC_STATES_RACE_QUERY = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
GROUP BY
   res_state,
   race_ethnicity_combined
''' % CDC_TABLE)

CDC_STATES_RACE_UP_TO_CRDT_QUERY = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  cdc_case_earliest_dt <= %s
GROUP BY
   res_state,
   race_ethnicity_combined
''' % (CDC_TABLE, CRDT_COMPARE_DATE))

CDC_PUBLIC_GEO_STATES_QUERY = ('''
SELECT
  res_state,
  COUNT(*) as cdc_public_geo_cases
FROM
  %s
GROUP BY
   res_state
''' % CDC_PUBLIC_GEO_TABLE)

CDC_PUBLIC_GEO_COUNTIES_RACE_QUERY = ('''
SELECT
  res_state,
  res_county,
  CASE ethnicity = "Non-Hispanic/Latino"
    WHEN true THEN race
    ELSE ethnicity
  END as race_ethnicity_combined,
  COUNT(*) as cdc_public_geo_cases
FROM
  %s
GROUP BY
   res_county,
   res_state,
   race_ethnicity_combined
''' % CDC_PUBLIC_GEO_TABLE)

CDC_PUBLIC_GEO_STATES_RACE_QUERY = ('''
SELECT
  res_state,
  CASE ethnicity = "Non-Hispanic/Latino"
    WHEN true THEN race
    ELSE ethnicity
  END as race_ethnicity_combined,
  COUNT(*) as cdc_public_geo_cases
FROM
  %s
GROUP BY
   res_state,
   race_ethnicity_combined
''' % CDC_PUBLIC_GEO_TABLE)
In [3]:
#@title
def FieldAnalysis(project_id, table, field_list, calculate_race_ethnicity=False):
  dict = {}
  for field in field_list:
      dict[field] = [0.0, 0.0, 0.0, 0.0]
  unknowns = pd.DataFrame(dict, index=['Unknown', 'Missing', 'NA', 'Known'])
  field_series = []
  value_series = []
  percent_series = []
  cases_series = []
  chart_denominator = 1000000

  for field in field_list:
    field_unknowns_query = ('''
    SELECT
      %s,
      count(*) as cases
    FROM
      %s
    GROUP BY
      %s
    ''')
    if calculate_race_ethnicity and field == 'race_ethnicity_combined':
      field_unknowns_query = ('''
      SELECT
        CASE ethnicity = "Non-Hispanic/Latino"
          WHEN true THEN race
          ELSE ethnicity
        END as %s,
        count(*) as cases
      FROM
        %s
      GROUP BY
        %s
      ''')    
    query = field_unknowns_query % (field, table, field)
    field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
    field_unknowns_df.set_index(field, inplace=True)
    field_unknowns_df.index = field_unknowns_df.index.fillna('Null')

    field_display_name = {
        'cdc_case_earliest_dt': 'CDC earliest case date',
        'current_status': 'Case status',
        'res_state': 'State',
        'res_county': 'County',
        'sex': 'Sex',
        'age_group': 'Age',
        'race_ethnicity_combined': 'Race/Ethnicity',
        'race': 'Race',
        'ethnicity': 'Ethnicity',
        'case_month': 'Case month'
    }

    missing_count = 0
    if 'Missing' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Missing'].cases
    if 'Null' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Null'].cases
    if '' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc[''].cases
    if 'OTH' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['OTH'].cases
    if 'nul' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['nul'].cases
    unknowns.loc['Missing', field] = missing_count

    if 'Unknown' in field_unknowns_df.index:
      unknowns.loc['Unknown', field] = field_unknowns_df.loc['Unknown'].cases
    if 'NA' in field_unknowns_df.index:
      unknowns.loc['NA', field] = field_unknowns_df.loc['NA'].cases
    unknowns.loc['Known', field] = field_unknowns_df.cases.sum() - (
        unknowns.loc['Missing', field] +
        unknowns.loc['Unknown', field] +
        unknowns.loc['NA', field])
    field_series.extend([field_display_name.get(field, field)] * 4)
    value_series.extend(['Known', 'Suppressed', 'Unknown', 'Missing'])
    percent_series.extend([unknowns.loc['Known', field] / field_unknowns_df.cases.sum(),
                           unknowns.loc['NA', field] / field_unknowns_df.cases.sum(),
                           unknowns.loc['Unknown', field] / field_unknowns_df.cases.sum(),
                           unknowns.loc['Missing', field] / field_unknowns_df.cases.sum()])
    cases_series.extend([unknowns.loc['Known', field] / chart_denominator,
                           unknowns.loc['NA', field] / chart_denominator,
                           unknowns.loc['Unknown', field] / chart_denominator,
                           unknowns.loc['Missing', field] / chart_denominator])
    bars = pd.DataFrame.from_dict({'field': field_series,
                               'value': value_series,
                               'percent': percent_series,
                               'cases': cases_series})
  return alt.Chart(bars).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='%'), title=''),
      y=alt.Y('field', sort='x', title='Field'),
      color=alt.Color('value', scale=alt.Scale(scheme='category20'), title='Value'),
      order=alt.Order('field:N'),
      tooltip=[
                  alt.Tooltip('field:N', title='Field'),
                  alt.Tooltip('value:N', title='Value'),
                  alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
      ]
  )

def CreateNYTStateDataframe(query):
  nyt_states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for territory in NYT_TERRITORIES:
    nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
  nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
  nyt_states_df.set_index('state_fips_code', inplace=True)
  return nyt_states_df

def CreateCDCStateDataframe(query):
  states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
    states_df = states_df[states_df.res_state != state]
  states_df.rename(columns={'res_state': 'state'}, inplace=True)
  states_df['state_fips_code'] = states_df.state
  states_df = states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
  states_df['state_fips_code'] = states_df.state_fips_code.astype(int)
  states_df.set_index('state_fips_code', inplace=True)
  return states_df

def CreateCDCStateRaceDataframe(query, cases_field_prefix):
  states_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for state in ('Unknown', 'NA', 'Missing', 'OCONUS'):
    states_df = states_df[states_df.res_state != state]

  states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
  states_df = states_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_KNOWN_MAP})
  states_df.rename(columns={'res_state': 'state'}, inplace=True)

  cases_field = cases_field_prefix + 'cases' 
  crosstab_df = pd.crosstab(states_df['state'],
                            states_df.race_ethnicity_combined,
                            values=states_df[cases_field],
                            aggfunc=sum,
                            margins=True,
                            margins_name=cases_field
  )
  # Have to reset_index() to go from pandas multi-index to single index.
  crosstab_df = crosstab_df.reset_index()
  crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
  crosstab_df[cases_field_prefix + 'known_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
  crosstab_df[cases_field_prefix + 'known_or_na_cases'] = crosstab_df[cases_field] - crosstab_df.cdc_unknown_cases.fillna(0)
  crosstab_df[cases_field_prefix + 'known_cases_percent'] = round(
      crosstab_df[cases_field_prefix + 'known_cases'] /
      crosstab_df[cases_field], 4)
  crosstab_df[cases_field_prefix + 'known_or_na_cases_percent'] = round(
      crosstab_df[cases_field_prefix + 'known_or_na_cases'] /
      crosstab_df[cases_field], 4)
  crosstab_df['state_fips_code'] = crosstab_df.state
  crosstab_df = crosstab_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})

  # Remove territories and missing states for calculating summary stats.
  for territory in TERRITORIES:
    crosstab_df = crosstab_df[crosstab_df.state != territory]
  crosstab_df = crosstab_df[crosstab_df.state != 'NA']
  crosstab_df = crosstab_df[crosstab_df.state != 'Missing']
  crosstab_df = crosstab_df[crosstab_df.state != 'Unknown']
  crosstab_df.set_index('state_fips_code', inplace=True)
  return crosstab_df

def CreateCRDTStateRaceDataframe(query):
  crdt_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for territory in TERRITORIES:
    crdt_df = crdt_df[crdt_df.state != territory]
  crdt_df['state_fips_code'] = crdt_df.state
  crdt_df = crdt_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
  crdt_df['state_fips_code'] = crdt_df.state_fips_code.astype(int)
  crdt_df.set_index('state_fips_code', inplace=True)
  return crdt_df

def CreateNYTCountyDataframe(query):
  nyt_counties_df = pd.io.gbq.read_gbq(NYT_COUNTIES_QUERY, project_id=PROJECT_ID)
  nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
  nyt_counties_df.county_fips.unique()
  nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
  nyt_counties_df.set_index('county_fips', inplace=True)
  return nyt_counties_df

def CreateCDCCountyRaceDataframe(query, cases_field_prefix):
  cdc_counties_race_df = pd.io.gbq.read_gbq(query, project_id=PROJECT_ID)
  for territory in TERRITORIES:
    cdc_counties_race_df = cdc_counties_race_df[cdc_counties_race_df.res_state != territory]

  county_fips_map_df = pd.io.gbq.read_gbq(COUNTY_FIPS_MAPPING_QUERY, project_id=PROJECT_ID)

  county_fips_map_df.cdc_county = county_fips_map_df.cdc_county.str.lower()
  county_fips_map_df['state_county'] = county_fips_map_df.state + '-' + county_fips_map_df.cdc_county
  county_fips_map_df['state_county'] = county_fips_map_df.state_county.astype('string').str.strip()
  county_fips_map_df.set_index('state_county', inplace=True)

  # Concatenate the state and county names because county names are not unique across states.
  cdc_counties_race_df.res_county = cdc_counties_race_df.res_county.str.lower()
  cdc_counties_race_df['state_county'] = cdc_counties_race_df.res_state + '-' + cdc_counties_race_df.res_county
  cdc_counties_race_df['state_county'] = cdc_counties_race_df.state_county.astype('string').str.strip()
  cdc_counties_race_df.set_index('state_county', inplace=True)
  cdc_counties_race_df['race_ethnicity_combined'] = cdc_counties_race_df.race_ethnicity_combined.astype('string').str.strip()
  cdc_counties_race_df = cdc_counties_race_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})

  # Printed value used in the footnotes below.
  # All other checks for county_fips_code mappings are now in
  # https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006
  mismatches_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
  mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
  mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
  mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
  mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
  mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
  mismatches_df = mismatches_df[mismatches_df.res_county != 'missing']
  # print(mismatches_df.cases.sum())

  cdc_counties_race_df = cdc_counties_race_df.join(county_fips_map_df, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')

  cases_field = cases_field_prefix + 'cases'
  # Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
  cdc_counties_race_crosstab_df = pd.crosstab(cdc_counties_race_df['county_fips'],
                                              cdc_counties_race_df.race_ethnicity_combined,
                                              values=cdc_counties_race_df[cases_field],
                                              aggfunc=sum,
                                              margins=True,
                                              margins_name=cases_field
 )
  # Have to reset_index() to go from pandas multi-index to single index.
  cdc_counties_race_crosstab_df = cdc_counties_race_crosstab_df.reset_index()
  cdc_counties_race_crosstab_df.drop(axis=0, index=len(cdc_counties_race_crosstab_df) - 1, inplace=True)
  cdc_counties_race_crosstab_df['county_fips'] = cdc_counties_race_crosstab_df.county_fips.astype(int)
  cdc_counties_race_crosstab_df[cases_field_prefix + 'known_cases'] = (
      cdc_counties_race_crosstab_df[cases_field] -
      cdc_counties_race_crosstab_df.na_cases.fillna(0) -
      cdc_counties_race_crosstab_df.unknown_cases.fillna(0))

  # Get the display names for each county.
  # Use ACS data that only has one FIPS code per county unlike the fips_county_map.
  acs_name_lookup_df = pd.io.gbq.read_gbq(ACS_POPULATION_DATA_QUERY, project_id=PROJECT_ID)

  acs_name_lookup_df['state_county'] = (acs_name_lookup_df.county.astype('string').str.strip() +
                                        ', ' + acs_name_lookup_df.state.astype('string').str.strip())
  acs_name_lookup_df.drop(columns=['state', 'county'], inplace=True)
  acs_name_lookup_df.set_index('county_fips', inplace=True)

  cdc_counties_race_df = cdc_counties_race_crosstab_df.join(acs_name_lookup_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
  cdc_counties_race_df.county_fips = cdc_counties_race_df.county_fips.astype(int)
  cdc_counties_race_df.set_index('county_fips', inplace=True)
  return cdc_counties_race_df

def CreateScatterPlot(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
    axis_format = ',.0f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'
    axis_format = '.0%'

  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))
  plot = alt.Chart(chart_df).mark_circle(size=60).encode(
      alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      color=alt.Color(fields_dict['percent']['name'],
                      type='quantitative',
                      scale=alt.Scale(scheme=scale_scheme,
                                      reverse=scale_reverse,
                                      domain=scale_domain,
                                      clamp=True),
                      legend=alt.Legend(format=legend_format),
                      title=metric_type.capitalize()),
      tooltip=tooltips,
  ).properties(
      height=height,
      width=width,
  )
  if metric_type == 'ratio':
    plot.interactive()

  line = pd.DataFrame({
      'x': [0, scale_max],
      'y': [0, scale_max],
  })

  if metric_type == 'ratio':
    line_plot = alt.Chart(line).mark_line(color='black').encode(
        x='x',
        y='y',
    )
  elif metric_type == 'percent':
    line_plot = (
        alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
        alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
    )
  # Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
  scatter = (plot + line_plot).properties(
      title=title,
      height=height,
      width=width,
  ).interactive()
  return scatter

def CreateMap(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  fips_code = 'state_fips_code'
  topo_feature = US_STATES_TOPO
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'
    fips_code = 'county_fips'
    topo_feature = US_COUNTIES_TOPO

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'

  highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))

  field_names = [geo_field]
  field_names.extend([fields_dict[field]['name'] for field in fields_dict])
  plot = alt.Chart(topo_feature).mark_geoshape(
        stroke='white',
        strokeOpacity=.2,
        strokeWidth=1
    ).project(
      type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(chart_df, fips_code, field_names)
    ).encode(
        alt.Color(fields_dict['percent']['name'],
                  type='quantitative',  
                  legend=alt.Legend(format=legend_format),
                  scale=alt.Scale(scheme=scale_scheme,
                                  reverse=scale_reverse,
                                  domain=scale_domain,
                                  clamp=True,
                                  ),
                  title=metric_type.capitalize()),
         tooltip=tooltips
    ).add_selection(
        highlight,
    )

  states_outline = alt.Chart(US_STATES_TOPO).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
        type='albersUsa'
  )

  states_fill = alt.Chart(US_STATES_TOPO).mark_geoshape(
        fill='silver',
        stroke='white'
  ).project('albersUsa')

  layered_map = alt.layer(states_fill, plot, states_outline).properties(
        height=height,
        width=width,
        title=title,
  )
  return layered_map

def CreateScatterPlotAndMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
  scatter = CreateScatterPlot(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
  map = CreateMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
  return (scatter | map).configure_view(
       strokeWidth=0,
   ).configure_mark(
       stroke='grey'
   ).configure_legend(
       gradientLength=scatter_height - 50
   )

def PrintSummaryStats(chart_df, field='percent'):
  plus_minus_15_df = chart_df[chart_df[field] >= .85]
  plus_minus_15_df = plus_minus_15_df[plus_minus_15_df[field] <= 1.15]
  print('between +/-15%: ', len(plus_minus_15_df), round(len(plus_minus_15_df) / len(chart_df), 2))
  plus_minus_50_df = chart_df[chart_df[field] >= .50]
  plus_minus_50_df = plus_minus_50_df[plus_minus_50_df[field] <= 1.50]
  print('between +/-50%: ', len(plus_minus_50_df), round(len(plus_minus_50_df) / len(chart_df), 2))
  print('< than .50: ', len(chart_df[chart_df[field] < .5]))
  print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
  print(chart_df[field].describe())

Abstract

The Covid Tracking Project was the most reliable source for COVID-19 data with race/ethnicity at the state level until it stopped collecting data on March 7, 2021. The CDC's Case Surveillance Restricted Access and Public Use with Geography datasets could potentially replace the Covid Tracking Project's dataset and additionally enable analyses of age and race/ethnicity along with county-level data. This paper evaluates the completeness of the CDC datasets at the state and county levels in terms of (1) the total number of cases included compared to the New York Times, and (2) the number of cases included with race/ethnicity data compared to the Covid Tracking Project.

The CDC's Restricted Access dataset contains 76% of the cases in the New York Times up to March 16, and 57% of cases have race/ethnicity information vs. 67% in the Covid Tracking Project. At the state and county levels, the dataset's completeness is highly variable; for example, Minnesota has 101% of the cases included in the New York Times, while Louisiana has 5% of the cases in the New York Times. Minnesota has 91% of cases with race/ethnicity, while Louisiana has 19% with race/ethnicity (vs. 94% in the Covid Tracking Project). Texas alone is missing 2.7M cases (more than a third of the total 7.1M missing cases). California is missing race/ethnicity for 2.6M people (more than a quarter of the 9.7M cases missing race/ethnicity) including all 1.5M Hispanic/Latino cases reported on the state public health website. The CDC's Public Use with Geography dataset is similar to the Restricted Access dataset for total case counts, but is less complete due to more privacy suppression; e.g., only 46% of cases have race/ethnicity information.

The CDC datasets are useful for the individual states and counties that have a high degree of data completeness, but we hope that the datasets' completeness will continue to improve across all states and counties.

Background

The racial and ethnic disparities in the COVID-19 pandemic have exposed longstanding health inequities in the U.S., which have been described in multiple analyses of COVID-19 data by the Covid Tracking Project, New York Times, American Public Media Research Lab, and Kaiser Family Foundation among many others. Unfortunately, we still don't have a full understanding of these disparities because race/ethnicity data continue to be fragmented and incomplete. On January 29, the Covid Tracking Project wrote, "the continued lack of either complete federal demographic data or federal guidelines for what states should publish make it impossible to fully understand who is being infected with and dying of COVID-19."

The most reliable and up-to-date data are scattered across state and local public health websites that use different standards and categories for reporting race/ethnicity. In 2020, collecting these data and turning them into a unified dataset was largely left to non-governmental organizations like the Covid Tracking Project, which stopped collecting data on March 7, 2021, a full year after it started. Even the federal government looked to the Covid Tracking Project for reliable COVID-19 race/ethnicity data. The office of the Assistant Secretary for Planning and Evaluation, an agency within the U.S. Department of Health and Human Services, wrote in October 2020, "The volunteer-based COVID tracking project has created the most comprehensive centralized resource for race and ethnicity data at the state level."

The outlook for race/ethnicity data on cases is even bleaker at the county level. Since November 2020, the CDC has shown total case counts at the county level in a dashboard. Before the CDC published that data, several non-governmental organizations (New York Times, Johns Hopkins University, USAFacts) gathered data for total case counts at the county level. But none of these sources collect or publish race/ethnicity data, which would be a huge undertaking due to the non-standard reporting of race/ethnicity across state and local public health websites. The only public analysis of case data with race/ethnicity at the county level was in July 2020 when the New York Times published The Fullest Look Yet at the Racial Inequity of Coronavirus. The New York Times used CDC case surveillance data that they obtained via FOIA and legal action to analyze cases up to May 28, 2020.

Now that the Covid Tracking Project has stopped collecting data, how can we track the disproportionate impact of COVID-19 on communities of color in the U.S. at the state and county levels? There is only one option for public COVID-19 case data with race/ethnicity as a unified dataset across U.S. states and counties: the CDC's case surveillance data, which is based on state and local health departments reporting cases to the CDC. There are more options for data on deaths, which we discuss in a separate deaths data report.

The CDC publishes several variations of this dataset:

  1. Public Use dataset at the U.S. level (dashboard)
  2. Public Use with geography dataset with state and county information (dashboard)
  3. Restricted access dataset with state and county information

These three datasets are from the same underlying source; they have the same number of records and are released at the end of each month. The Public Use dataset was first published in May 2020. The Restricted Access dataset was first published at the link above in November 2020; the dataset itself is in a private GitHub repository that the CDC shares with people who apply for access and adhere to a data use agreement. The CDC's initial restricted access data agreement did not allow for county-level analyses to be made public, but an updated data agreement from December 14, 2020 allowed such analyses. In January 2021, the Morehouse School of Medicine's Satcher Health Leadership Institute (MSM/SHLI) in collaboration with Google.org applied for and got access to this data within a few days. Several months later in March 2021, the CDC published the Public Use with Geography dataset with state and county information, fewer fields, and more data suppression than in the Restricted Access dataset. In addition to these datasets, the CDC also publishes charts showing race/ethnicity breakdowns at the U.S. level in their COVID Data Tracker; see the Appendix for a comparison of that tracker to these datasets.

The CDC's Case Surveillance Restricted Access dataset has enormous potential: It could allow us to analyze data across all states and counties to study the disparities in COVID-19 cases using consistent race/ethnicity categories. This dataset could pick up where the Covid Tracking Project's dataset left off and additionally enable the first analysis of race/ethnicity disparities at the county level since July 2020. The dataset also has age and sex for each case, so we could analyze the intersection of race/ethnicity with age and sex. The dataset has case report dates, which would allow us to look at cases over time. However, the dataset will only live up to its potential if it is complete both in terms of the number of cases included and the number of cases that have race/ethnicity.

Unfortunately, the CDC's Case Surveillance Restricted Access dataset has significant completeness issues:

  • Only 76% of total cases in the New York Times up to March 16 are included (7.1M out of 29.6M cases are missing)
  • Of the cases in the dataset, only 57% have known race/ethnicity (9.7M out of 22.5M cases are missing race/ethnicity)

For the 12.8M cases where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:

In [4]:
#@title
overall_df = pd.io.gbq.read_gbq(CDC_OVERALL_RACE_QUERY, project_id=PROJECT_ID)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': RACE_ETHNICITY_COMBINED_MAP})
overall_df = overall_df.set_index('race_ethnicity_combined')

chart_denominator = 1000000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
         overall_df.cases['black_cases'] / chart_denominator,
         overall_df.cases['white_cases'] / chart_denominator,
         overall_df.cases['asian_cases'] / chart_denominator,
         overall_df.cases['nhpi_cases'] / chart_denominator,
         overall_df.cases['aian_cases'] / chart_denominator,
         overall_df.cases.sum() / chart_denominator,
]

# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
    60481746 / chart_denominator,
    40596040  / chart_denominator,
    196789401 / chart_denominator,
    18427914  / chart_denominator,
    565473 / chart_denominator,
    2236348 / chart_denominator,
    328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
  percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
    'Hispanic/Latino',
    'Black',
    'White',
    'Asian',
    'Native Hawaiian/Pacific Islander',
    'American Indian/Alaska Native',
    '*Total Including Unknowns*',
], 'percent': percent_list,
   'cases': cases_list,
   'population': pop_list,
})
bars = alt.Chart(prevalence).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='.1%'), title=''),
      y=alt.Y('group', sort='-x', title=''),
      color=alt.Color('group', 
                      scale=alt.Scale(scheme='tableau20'),
                      title='',
                      legend=None),
      tooltip=[
                  alt.Tooltip('group:N', title='Race/Ethnicity Group'),
                  alt.Tooltip('percent:Q', format='.2%', title='Prevalence within group'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Cases in group (millions)'),
                  alt.Tooltip('population:Q', format=',.2f', title='Population of group (millions)'),
      ]
).properties(
   title='Percent of Race/Ethnicity Group who had COVID-19 based on Incomplete CDC Data up to %s' % DATE_DISPLAY_NAME
)

alt.concat(bars).properties(
    title=alt.TitleParams(
        ['Population data source: The U.S. Census Bureau\'s American Community Survey 2019 5-year estimates.'],
        baseline='bottom',
        dy=20,
        orient='bottom',
        fontWeight='normal',
        fontSize=11
    )
).display()

But the chart above is based on incomplete data. With only 76% of cases included, the total percent of people who had COVID-19 should be 9.0% instead of 6.9%. It's harder to estimate how much the individual race/ethnicity data are undercounting the true number of confirmed COVID-19 cases. For example, the CDC data say that 0% of cases in California were Hispanic/Latino people, whereas the California public health website reports that Hispanics/Latinos made up 55.5% of California cases (1.5M people) as of March 17.

If we added all 9.7M cases with missing race/ethnicity to the Hispanic/Latino group, the percent of Hispanic/Latinos in the U.S. who had COVID-19 would go from 4.3% to 20.2% — a 5x increase. If all 9.7M cases with missing race/ethnicity were Black people, the percent of Black people who had COVID-19 would go from 3.8% to 27.6% — an 7x increase. While these extreme scenarios are unlikely, they show us why missing race/ethnicity data is preventing us from truly understanding and addressing the disparities in the COVID-19 pandemic in the U.S.

At the same time, the data from state public health websites are not perfect; the Covid Tracking Project only has race/ethnicity data for 67% of cases up to March 7. We'll compare data completeness at the state level in the case surveillance data vs. the New York Times and the Covid Tracking Project. If we can understand how complete the data are at the state level in the CDC's dataset, then we can confidently use that dataset to replace the Covid Tracking Project's dataset in certain states and know which states still have more reliable data on their public health websites.

Overview

The goal of this analysis is to assess the completeness of the CDC's Restricted Access and Public Use with Geography datasets to evaluate their feasibility in examining disparities in race/ethnicity for COVID-19 cases at the state and county levels. We'll first focus on the Restricted Access dataset, which is more complete than the Public Use with Geography dataset due to privacy reasons.

The overall data completeness findings for the CDC's Restricted Access Dataset are:

  1. Data Overview: Most fields in the CDC's Restricted Access dataset are missing too many values to be useful. The fields that are reliably filled in are dates of reporting and symptoms, case status (lab confirmed or probable), state, county, sex, age, and race/ethnicity. Whether the person died was known for 53% of cases and whether they were hospitalized was known for 40% of cases. All other fields were known for 46% or fewer of the cases. Race/ethnicity was only known for 57% of cases, as opposed to 97%-100% for all the other fields below.
In [5]:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
FieldAnalysis(PROJECT_ID, CDC_TABLE, field_list).display()
  1. Total Case Counts: The CDC's Restricted Access dataset contains 76% of the cases reported in the New York Times (NYT) up to March 16. This discrepancy is not solely due to a time lag across all states; there's high variability at the state level with just over half of states being within 15% of the NYT total case counts, about a quarter of states having fewer than 50% of the NYT total case counts. Wyoming and Texas have the biggest discrepancies with only 2% and 3% of the total cases included, respectively. While it is expected that the CDC data will lag, a time lag alone can't explain the discrepancies in some states.
  2. Cases with Race/Ethnicity: Race/ethnicity is available for 57% of cases in the CDC dataset compared to 67% in the Covid Racial Data Tracker (CRDT). Race/ethnicity data availability is highly variable across different states, which is common to both the CDC and CRDT datasets, but the CRDT has more cases with race/ethnicity information than the CDC in all but two states, New York and Massachusetts.

We can evaluate the overall completeness of the CDC datasets by calculating at (1) the percent of total case counts compared to the NYT, (2) the percent of cases with race/ethnicity, and then (3) combine those two percentages into a composite that represents the percentage of total expected cases that have race/ethnicity. Later on, we will do this same analysis at the state and county levels.

In [6]:
#@title
# Manually update these fields based on chart above, latest CDC data,
# and improving state/county data below.

overall_row_names = [
    'Update frequency',
    'Latest case date',
    'Cases in dataset as of date',
    'Cases in NYT as of date',
    '(as a % of NYT)',
    'Cases with race/ethnicity',
    '(as a % of cases in dataset)',
    '(composite % of NYT total with race/ethnicity)',
]
overall_crdt_metadata = [
    'Stopped',
    'Mar 7, 2021',
    '28.8M',
    '29.0M',
    '(99%)', 
    '19.2M',
    '(67%)',
    '(66%)',
]
overall_cdc_metadata = [
    'Monthly',                        
    'Mar 16, 2021',
    '22.5M',
    '29.6M',
    '(76%)',
    '12.8M',
    '(57%)',
    '(43%)',
]
overall_cdc_public_geo_metadata = [
    'Monthly',                        
    'Mar 16, 2021',
    '22.5M',
    '29.6M',
    '(76%)',
    '10.3M',
    '(46%)',
    '(35%)',
]
table_data = {'CRDT': overall_crdt_metadata, 'CDC Restricted': overall_cdc_metadata, 'CDC Public Geo': overall_cdc_public_geo_metadata}
metadata_df = pd.DataFrame(table_data, index=overall_row_names)
metadata_df.head(15)
Out[6]:
CRDT CDC Restricted CDC Public Geo
Update frequency Stopped Monthly Monthly
Latest case date Mar 7, 2021 Mar 16, 2021 Mar 16, 2021
Cases in dataset as of date 28.8M 22.5M 22.5M
Cases in NYT as of date 29.0M 29.6M 29.6M
(as a % of NYT) (99%) (76%) (76%)
Cases with race/ethnicity 19.2M 12.8M 10.3M
(as a % of cases in dataset) (67%) (57%) (46%)
(composite % of NYT total with race/ethnicity) (66%) (43%) (35%)

Overall, the CDC datasets' completeness falls short of the CRDT dataset, however there are some states where the two datasets are comparable. We also look at ways to improve both of the CDC's datasets at the state and county levels.

What we didn't include in this report:

Restricted Data Completeness Analysis

Data Overview

The CDC datasets come from a case report form that is a dense, five-page form about each lab-confirmed or probable COVID-19 case (old form prior to Jan 15, 2021). The Restricted Access dataset contains 32 fields, the Public Use with Geography dataset contains 19 fields, and the Public Use version contains 12 fields. In addition to data from the case report form, the CDC separately collects aggregate data that doesn't have race/ethnicity information.

The CDC distinguishes between aggregate data that comes from state and local public health websites vs. line- or case-level data that comes to the CDC from public health departments. The CDC FAQs say that aggregate data are more accurate than case data:

Aggregate counts provide the most up-to-date validated numbers on cases and deaths.

Public health websites often do contain race/ethnicity details, but state websites do not all use the same standard race/ethnicity categories, which makes it hard to collect and aggregate the data. The CRDT captured the many non-standard ways in which different states report on race/ethnicity, where ethnicity is whether a person is Hispanic/Latino. Some states report race/ethnicity as a combined field where each race/ethnicity group is mutually exclusive, which is how the CDC case dataset reports this field. Other states report race/ethnicity as separate fields where Hispanic/Latino people are counted within different race groups as well as in a separate field for ethnicity. States can also differ in terms of which race categories they use, how they define them, whether multiracial people are counted multiple times in different categories, and what's included in the "Other" race category. For more details, see this CRDT analysis.

So, we will need to sacrifice the accuracy and timeliness of aggregate data to get standardized race/ethnicity reporting on cases across all states and counties. However, standardized reporting on race/ethnicity is only useful if we have complete enough data in states and counties.

The CDC has an FAQ about data completeness:

How complete are the data that the CDC receives about COVID-19 cases?

The COVID-19 pandemic has put unprecedented demands on the public health data supply chain. In many states, the large number of COVID-19 cases has severely strained the ability of hospitals, healthcare providers, and laboratories to report cases with complete demographic information, such as race and ethnicity. The unprecedented volume of cases has also limited the ability of state and local health departments to conduct thorough case investigations and collect all requested case data.

As a result, many COVID-19 case notifications submitted to CDC do not have complete information on patient demographics [...] Because it can be time-consuming for jurisdictions to collect the additional information, these data can lag behind the aggregate counts. Because of missing data, analyses of these data elements are likely an underestimate of the true occurrence.

The CDC's COVID Data Tracker says that they are "working with states to provide more information on race/ethnicity for reported cases. The percent of reported cases that include race/ethnicity data is increasing."

Total Case Counts

We will compare the CDC data against two sources of aggregate data: The NYT and the CRDT's public data, which are aggregated from state and local public health websites. The CRDT is the only source for case data with race/ethnicity breakdowns, but there are several sources for county-level total case counts in addition to the NYT, such as JHU and USAFacts (this paper analyzes the differences between those sources at the state level up to July, 2020 for cases and deaths).

The table below compares geographic vs. race/ethnicity availability for these three different data sources:

In [7]:
#@title
row_names = [
    'Total Cases — States',
    'Total Cases — Counties',
    'Cases by Race/Ethnicity — States',
    'Cases by Race/Ethnicity — Counties'
]
nyt_yn = [
    '✅',
    '✅',
    '❌',
    '❌',
]
crdt_yn = [
    '✅',
    '❌',
    '✅',
    '❌',
]
cdc_yn = [
    '✅',
    '✅',
    '✅',
    '✅',
]
table_data = {'NYT': nyt_yn, 'CRDT': crdt_yn, 'CDC': cdc_yn}
availability_df = pd.DataFrame(table_data, index=row_names)
availability_df.head()
Out[7]:
NYT CRDT CDC
Total Cases — States
Total Cases — Counties
Cases by Race/Ethnicity — States
Cases by Race/Ethnicity — Counties

Because the CDC is the only data source that has race/ethnicity at the county level, the most similar data for purposes of comparison are (1) NYT data at the state and county levels with no race/ethnicity, and (2) CRDT data at the state level with race/ethnicity.

We will compare against the NYT up to March 16, 2021, which is the latest reporting date in the CDC data. We will compare against the CRDT up to March 7, 2021, which was the last date they reported data. We expect to see small differences (e.g., +/- 15%) in the case counts due to time lags in reporting the data, but time lags cannot explain large percentages of missing cases.

Baseline: CRDT vs. NYT

To get a baseline of how much we could expect the CDC case counts to match the NYT or CRDT, we can see how closely the NYT and CRDT match each other. Each dot below is a state (hover to see details), and the black line shows where the NYT and CRDT case counts are equal.

In [8]:
#@title
crdt_states_df = pd.io.gbq.read_gbq(CRDT_QUERY, project_id=PROJECT_ID)
crdt_states_df.set_index('state', inplace=True)

nyt_states_compare_crdt_df = CreateNYTStateDataframe(NYT_STATES_COMPARE_CRDT_QUERY)

crdt_states_df.reset_index(inplace=True)
crdt_states_df['state_fips_code'] = crdt_states_df.state
crdt_states_df = crdt_states_df.replace(to_replace={'state_fips_code': STATES_TO_FIPS})
crdt_states_df.set_index('state_fips_code', inplace=True)
nyt_crdt_counts_df = nyt_states_compare_crdt_df.join(crdt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')

nyt_crdt_counts_df['percent'] = round(nyt_crdt_counts_df.crdt_cases / nyt_crdt_counts_df.nyt_cases, 2)
nyt_crdt_counts_df.reset_index(inplace=True)
In [9]:
#@title
nyt_crdt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT cases'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CRDT to NYT'},
}
nyt_crdt_title = 'Ratio of CRDT to NYT Cases by State up to %s' % CRDT_COMPARE_DATE_DISPLAY_NAME

CreateScatterPlotAndMap(
    nyt_crdt_counts_df, nyt_crdt_fields_dict, nyt_crdt_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(nyt_crdt_counts_df)

The ratio of NYT to CRDT cases is between 0.92 and 1.03 for all states:

  • Average = 1.00
  • Median = 1.00
  • Min = 0.92 (Missouri)
  • Max = 1.03 (Georgia)
  • Percent between 0.85 and 1.15 = 100% (50 states + D.C. within +/- 0.15)

States: CDC vs. NYT

We can see below that the CDC Restricted Access dataset case counts differ from the NYT case counts more drastically than the CRDT did. Note: In the analysis and charts below, we'll refer to the CDC Restricted Access dataset as the "CDC" dataset.

In [10]:
#@title
cdc_states_df = CreateCDCStateDataframe(CDC_STATES_QUERY)
nyt_states_df = CreateNYTStateDataframe(NYT_STATES_QUERY)

cdc_nyt_states_df = cdc_states_df.join(nyt_states_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
cdc_nyt_states_df.reset_index(inplace=True)
cdc_nyt_states_df['percent'] = round(cdc_nyt_states_df.cdc_cases / cdc_nyt_states_df.nyt_cases, 4)
In [11]:
#@title
cdc_nyt_state_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_state_title = 'Ratio of CDC to NYT Cases by State up to %s' % DATE_DISPLAY_NAME

CreateScatterPlotAndMap(
    cdc_nyt_states_df, cdc_nyt_state_fields_dict, cdc_nyt_state_title, TOTAL_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, MAP_WIDTH, 'state', 'ratio'
).display()
#PrintSummaryStats(cdc_nyt_states_df)

Texas alone is missing 2.7M cases compared to the total case counts in the NYT data (38% of the 7.1M missing cases across all states).

The ratio of CDC to NYT cases is between 0.02 and 1.05 for all states + D.C.:

  • Average = 0.75
  • Median = 0.91
  • Min = 0.02 (Wyoming)
  • Max = 1.06 (New York)
  • Percent between 0.85 and 1.15 = 57% (29 states within +/- 0.15)
  • Percent between 0.50 and 1.50 = 78% (40 states within +/- 0.50)

The 29 states that are within +/-15% of the NYT data could plausibly be off due to time lags in reporting cases to the CDC vs. reporting them on state public health websites, but there are many outlier states that are too far off from the NYT case counts to be explained by a time lag:

  • 11 states: < 0.50 ratio of CDC to NYT cases
  • 4 states: < 0.10 ratio of CDC to NYT cases (Wyoming, Texas, Louisiana, and West Virginia)

Counties: CDC vs. NYT

We can do the same analysis at the county level. The CDC Restricted Access dataset contains 3,072 counties in the 50 states + D.C., which is 98% of all counties that account for 99.9% of the population.

Each dot is a county (hover to see details). We show all 3,063 counties in the CDC data that were also in the NYT data on the left and zoom in on the smaller counties on the right. Note that the five counties in New York City and four boroughs in Alaska are missing because the NYT combined them into one region for New York City and two combined regions in Alaska; see the Appendix for more details.

In [12]:
#@title
# CDC vs. NYT county
cdc_counties_race_df = CreateCDCCountyRaceDataframe(CDC_COUNTIES_RACE_QUERY, 'cdc_')
nyt_counties_df = CreateNYTCountyDataframe(NYT_COUNTIES_QUERY)

cdc_nyt_counties_race_df = cdc_counties_race_df.join(nyt_counties_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
cdc_nyt_counties_race_df = cdc_nyt_counties_race_df.reset_index()
cdc_nyt_counties_race_df['percent'] = round(cdc_nyt_counties_race_df.cdc_cases / cdc_nyt_counties_race_df.nyt_cases, 2)
cdc_nyt_counties_race_df['cdc_known_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_cases /
                                                            cdc_nyt_counties_race_df.cdc_cases, 2)
cdc_nyt_counties_race_df['cdc_known_or_na_cases'] = (cdc_nyt_counties_race_df.cdc_known_cases.fillna(0) +
                                                     cdc_nyt_counties_race_df.na_cases.fillna(0))
cdc_nyt_counties_race_df['cdc_known_or_na_cases_percent'] = round(cdc_nyt_counties_race_df.cdc_known_or_na_cases /
                                                                  cdc_nyt_counties_race_df.cdc_cases, 2)

#PrintSummaryStats(cdc_nyt_counties_race_df)
# These values are used in the num_counties table.
#print(len(cdc_counties_race_df))
#print(len(cdc_counties_race_df) / 3143)
#print(cdc_counties_race_df.total_pop.sum())
#print(cdc_counties_race_df.total_pop.sum() / 324697795)  # Population covered in these counties
#print(cdc_counties_race_df.cdc_known_cases.sum())
#print(0.55 * 324697795) # NYT population
In [13]:
#@title
cdc_nyt_county_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT cases'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC cases'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_county_title = 'Ratio of CDC to NYT Cases by County up to %s' % DATE_DISPLAY_NAME
zoom_cdc_nyt_title = 'Zoom in on counties with up to 200,000 Cases'

cdc_nyt_county_scatter = CreateScatterPlot(
    cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, cdc_nyt_county_title, COUNTY_CASES_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)
cdc_nyt_county_zoom_scatter = CreateScatterPlot(
    cdc_nyt_counties_race_df, cdc_nyt_county_fields_dict, zoom_cdc_nyt_title, COUNTY_CASES_ZOOM_SCALE_MAX, SCATTER_HEIGHT, SCATTER_WIDTH, 'county', 'ratio'
)

(cdc_nyt_county_scatter | cdc_nyt_county_zoom_scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=MAP_HEIGHT - 50
).configure_mark(
    stroke='grey'
).display()
#PrintSummaryStats(cdc_nyt_counties_race_df)