COVID-19 Data

Data

https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv

Libraries

library(tidyverse)
library(knitr)
library(readxl)
library(zoo)

Start

url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
home = read_csv(url)

read_excel = '../data/PopulationEstimates.xls'

pop = readxl::read_excel("../data/PopulationEstimates.xls", skip = 2) %>% 
  select(pop2019 = POP_ESTIMATE_2019, fips = FIPStxt, State)

CA_pop_2019 = pop %>%
  filter(State == "CA") 

Question 1: Covid-19 California Cases

#Step 1

home = read_csv(url)

#Step 2

steptwo = home %>% filter(state == "California") %>% group_by(county) %>% 
  mutate(newCase = cases - lag(cases)) %>% arrange(-newCase) %>% ungroup()

#Step 3

#Table 1: 5 counties with most cumulative cases

most_cumulative_cases = steptwo %>% filter(date == max(date)) %>% group_by(county) %>%
  summarize(sum_cases = sum(cases)) %>% arrange(-sum_cases) %>%
  ungroup() %>% slice_max(sum_cases, n = 5)

knitr::kable(most_cumulative_cases, caption = c("Cumulative Case Counts: Top 5 CA counties"),
  col.names = c("County", "Cumulative Cases"))
Cumulative Case Counts: Top 5 CA counties
County Cumulative Cases
Los Angeles 253985
Riverside 55073
Orange 52121
San Bernardino 50699
San Diego 42742
#Step 2 (Table 2)

steptwo = home %>% filter(state == "California") %>% group_by(county) %>% 
  mutate(newCase = cases - lag(cases)) %>% arrange(-newCase) %>% ungroup()

#Step 3 (Table 2)

#Table 2: 5 counties with most new cases
most_new_cases = steptwo %>% filter(date == max(date)) %>% slice_max(newCase, n=5) %>% select (county, newCase)

knitr::kable(most_new_cases, caption = "New Case Counts: Top 5 CA counties", col.names = c("County", "New Cases"))
New Case Counts: Top 5 CA counties
County New Cases
Los Angeles 809
San Diego 265
Orange 185
Fresno 159
San Bernardino 156
#Cumulative Cases with Population (100,000)

pop = readxl::read_excel("../data/PopulationEstimates.xls", skip = 2) %>%
  select(pop2019 = "POP_ESTIMATE_2019", fips = "FIPStxt", state = "State")

CA_pop_covid = right_join(pop, home, by ="fips") %>% 
  filter(date >= max(date, na.rm = TRUE)-13, state.x == "CA") %>% 
  group_by(county) %>%
  mutate(newCase = cases - lag(cases),
         newCase_pc = newCase / pop2019,
         cumulative_pc = cases / pop2019) %>% 
  ungroup()

today = CA_pop_covid %>% 
  filter(date == max(date))

most_per_cap_today  = today %>% 
  slice_max(newCase_pc, n = 5)

most_per_cap_all_time = today %>% 
  slice_max(cumulative_pc, n = 5) %>% 
  select(county = county, cumulative_pc)


#Table 1
pop_most_cumulative_cases = CA_pop_covid %>% 
  filter(date == max(date)) %>% 
  slice_max(cases, n=5) %>% 
  select(county, cumulative_pc)

knitr::kable(pop_most_cumulative_cases, caption = c("Pop Cumulative Case Counts: Top 5 CA counties"),
             col.names = c("County", "Cumulative Cases with Pop (100,000)"))
Pop Cumulative Case Counts: Top 5 CA counties
County Cumulative Cases with Pop (100,000)
Los Angeles 0.0252996
Riverside 0.0222918
Orange 0.0164125
San Bernardino 0.0232555
San Diego 0.0128034
#Table 2

pop_most_new_cases = CA_pop_covid %>% filter(date == max(date)) %>% slice_max(newCase_pc, n=5) %>% 
  select(county, newCase_pc) 


knitr::kable(pop_most_new_cases, caption = c("Pop New Case Counts: Top 5 CA counties"),
             col.names = c("County", "New Cases with Pop (100,000)"))
Pop New Case Counts: Top 5 CA counties
County New Cases with Pop (100,000)
Kings 0.0002615
San Benito 0.0002388
Monterey 0.0002027
Lake 0.0001708
Fresno 0.0001591
#Table 3: Safe counties

pop_safe_county = CA_pop_covid %>% 
  group_by(county) %>% 
  summarize(totNew = sum(newCase_pc, na.rm= TRUE) /  (max(pop2019) /100000)) %>% 
  filter(totNew <= 100) 


knitr::kable(pop_safe_county, caption = c("Pop Lowest Case Counts: CA counties"),
             col.names = c("County", "Lowest New Cases with Pop")) 
Pop Lowest Case Counts: CA counties
County Lowest New Cases with Pop
Alameda 0.0000646
Alpine 0.0000000
Amador 0.0024047
Butte 0.0012551
Calaveras 0.0029422
Colusa 0.0094772
Contra Costa 0.0001062
Del Norte 0.0007757
El Dorado 0.0001802
Fresno 0.0002117
Glenn 0.0099236
Humboldt 0.0003211
Imperial 0.0016901
Inyo 0.0052243
Kern 0.0001741
Kings 0.0032363
Lake 0.0014232
Lassen 0.0012838
Los Angeles 0.0000121
Madera 0.0014827
Marin 0.0004478
Mariposa 0.0006758
Mendocino 0.0011295
Merced 0.0006601
Modoc 0.0140731
Mono 0.0004793
Monterey 0.0005371
Napa 0.0006588
Nevada 0.0003618
Orange 0.0000355
Placer 0.0001973
Plumas 0.0008482
Riverside 0.0000355
Sacramento 0.0000885
San Benito 0.0035743
San Bernardino 0.0000643
San Diego 0.0000366
San Francisco 0.0001040
San Joaquin 0.0002532
San Luis Obispo 0.0003007
San Mateo 0.0001545
Santa Barbara 0.0002503
Santa Clara 0.0000592
Santa Cruz 0.0004006
Shasta 0.0001295
Sierra 0.0000000
Siskiyou 0.0009495
Solano 0.0001537
Sonoma 0.0003687
Stanislaus 0.0004175
Sutter 0.0016483
Tehama 0.0012276
Trinity 0.0013252
Tulare 0.0004762
Tuolumne 0.0010445
Ventura 0.0001375
Yolo 0.0005204
Yuba 0.0022945

Results: Covid-19 cumulative case counts are the highest in the following counties: Los Angeles, Riverside, Orange, San Bernardino, and San Diego. The highest new cases over the past 14 days are in the following counties: Imperial, Tulare, Merced, Madera, and Yuba. Despite the covid-19 increases throughout the state the following counties have shown less than 100 new cases over the past 14 days: Alpine, Calaveras, Del Norte, El Dorado, Humboldt, Lake, Mariposa, Modoc, Mono, Nevada, Plumas, Shasta, Siskiyou, Trinity, and Tuolumne. Based on these results you can see that population density plays a large role in the spread of covid-19 and more remote counties in Northern California have less current spread.

Question 2: Covid-19 New York, California, Louisiana, and Florida

Q2 = home %>%
  filter(state %in% c("New York","California", "Louisiana", "Florida")) %>%
  group_by(state, date) %>% summarise(cases = sum(cases)) %>% 
  mutate(newCases = cases - lag(cases),
         roll7 = zoo::rollmean(newCases, 7, fill = NA, align = 'right')) %>% ungroup()  
  
Q2 %>% ggplot(aes(x = date, y = newCases)) + geom_col(aes(y = newCases), col = NA, fill ="#F5B8B5") +
  geom_line(aes(y = roll7), col = "darkred", size = 1) + facet_grid(~state, scale = "free_y") +
  ggthemes::theme_wsj() + theme(legend.position = "right")

  labs(title = paste("Daily Cases in NY, CA, LA, FL")) +
  theme(plot.background = element_rect(fill = "white"),
        panel.background = element_rect(fill = "white"),
        plot.title = element_text(size = 10, face = 'bold')) +
  theme(aspect.ratio = .5)
## NULL

Results: (Although I was unable to process the second graph I’ve studied Covid-19 data since the “Stay in Place” orders were given.) It is important to review population numbers when reflecting on Covid-19 cases to get a better picture of states that are having large peaks or dips within case numbers. Without factoring population, states with higher populations will appear to have way more covid-19 cases, but this is not the true picture. Out of the four states reviewed, California has the largest population overall, but currently does not have the highest covid-19 cases. Since the first set of graphs do not take population into account California would appear much higher than overall and Louisiana would appear to have low cases. Since Louisiana has a much lower population than California, Florida, and New York if population was factored in the Louisiana graph would show much higher per population cases.