COVID-19 Data
Data
https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv
Libraries
library(tidyverse)
library(knitr)
library(readxl)
library(zoo)
Start
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
home = read_csv(url)
read_excel = '../data/PopulationEstimates.xls'
pop = readxl::read_excel("../data/PopulationEstimates.xls", skip = 2) %>%
select(pop2019 = POP_ESTIMATE_2019, fips = FIPStxt, State)
CA_pop_2019 = pop %>%
filter(State == "CA")
Question 1: Covid-19 California Cases
#Step 1
home = read_csv(url)
#Step 2
steptwo = home %>% filter(state == "California") %>% group_by(county) %>%
mutate(newCase = cases - lag(cases)) %>% arrange(-newCase) %>% ungroup()
#Step 3
#Table 1: 5 counties with most cumulative cases
most_cumulative_cases = steptwo %>% filter(date == max(date)) %>% group_by(county) %>%
summarize(sum_cases = sum(cases)) %>% arrange(-sum_cases) %>%
ungroup() %>% slice_max(sum_cases, n = 5)
knitr::kable(most_cumulative_cases, caption = c("Cumulative Case Counts: Top 5 CA counties"),
col.names = c("County", "Cumulative Cases"))
Cumulative Case Counts: Top 5 CA counties
Los Angeles |
253985 |
Riverside |
55073 |
Orange |
52121 |
San Bernardino |
50699 |
San Diego |
42742 |
#Step 2 (Table 2)
steptwo = home %>% filter(state == "California") %>% group_by(county) %>%
mutate(newCase = cases - lag(cases)) %>% arrange(-newCase) %>% ungroup()
#Step 3 (Table 2)
#Table 2: 5 counties with most new cases
most_new_cases = steptwo %>% filter(date == max(date)) %>% slice_max(newCase, n=5) %>% select (county, newCase)
knitr::kable(most_new_cases, caption = "New Case Counts: Top 5 CA counties", col.names = c("County", "New Cases"))
New Case Counts: Top 5 CA counties
Los Angeles |
809 |
San Diego |
265 |
Orange |
185 |
Fresno |
159 |
San Bernardino |
156 |
#Cumulative Cases with Population (100,000)
pop = readxl::read_excel("../data/PopulationEstimates.xls", skip = 2) %>%
select(pop2019 = "POP_ESTIMATE_2019", fips = "FIPStxt", state = "State")
CA_pop_covid = right_join(pop, home, by ="fips") %>%
filter(date >= max(date, na.rm = TRUE)-13, state.x == "CA") %>%
group_by(county) %>%
mutate(newCase = cases - lag(cases),
newCase_pc = newCase / pop2019,
cumulative_pc = cases / pop2019) %>%
ungroup()
today = CA_pop_covid %>%
filter(date == max(date))
most_per_cap_today = today %>%
slice_max(newCase_pc, n = 5)
most_per_cap_all_time = today %>%
slice_max(cumulative_pc, n = 5) %>%
select(county = county, cumulative_pc)
#Table 1
pop_most_cumulative_cases = CA_pop_covid %>%
filter(date == max(date)) %>%
slice_max(cases, n=5) %>%
select(county, cumulative_pc)
knitr::kable(pop_most_cumulative_cases, caption = c("Pop Cumulative Case Counts: Top 5 CA counties"),
col.names = c("County", "Cumulative Cases with Pop (100,000)"))
Pop Cumulative Case Counts: Top 5 CA counties
Los Angeles |
0.0252996 |
Riverside |
0.0222918 |
Orange |
0.0164125 |
San Bernardino |
0.0232555 |
San Diego |
0.0128034 |
#Table 2
pop_most_new_cases = CA_pop_covid %>% filter(date == max(date)) %>% slice_max(newCase_pc, n=5) %>%
select(county, newCase_pc)
knitr::kable(pop_most_new_cases, caption = c("Pop New Case Counts: Top 5 CA counties"),
col.names = c("County", "New Cases with Pop (100,000)"))
Pop New Case Counts: Top 5 CA counties
Kings |
0.0002615 |
San Benito |
0.0002388 |
Monterey |
0.0002027 |
Lake |
0.0001708 |
Fresno |
0.0001591 |
#Table 3: Safe counties
pop_safe_county = CA_pop_covid %>%
group_by(county) %>%
summarize(totNew = sum(newCase_pc, na.rm= TRUE) / (max(pop2019) /100000)) %>%
filter(totNew <= 100)
knitr::kable(pop_safe_county, caption = c("Pop Lowest Case Counts: CA counties"),
col.names = c("County", "Lowest New Cases with Pop"))
Pop Lowest Case Counts: CA counties
Alameda |
0.0000646 |
Alpine |
0.0000000 |
Amador |
0.0024047 |
Butte |
0.0012551 |
Calaveras |
0.0029422 |
Colusa |
0.0094772 |
Contra Costa |
0.0001062 |
Del Norte |
0.0007757 |
El Dorado |
0.0001802 |
Fresno |
0.0002117 |
Glenn |
0.0099236 |
Humboldt |
0.0003211 |
Imperial |
0.0016901 |
Inyo |
0.0052243 |
Kern |
0.0001741 |
Kings |
0.0032363 |
Lake |
0.0014232 |
Lassen |
0.0012838 |
Los Angeles |
0.0000121 |
Madera |
0.0014827 |
Marin |
0.0004478 |
Mariposa |
0.0006758 |
Mendocino |
0.0011295 |
Merced |
0.0006601 |
Modoc |
0.0140731 |
Mono |
0.0004793 |
Monterey |
0.0005371 |
Napa |
0.0006588 |
Nevada |
0.0003618 |
Orange |
0.0000355 |
Placer |
0.0001973 |
Plumas |
0.0008482 |
Riverside |
0.0000355 |
Sacramento |
0.0000885 |
San Benito |
0.0035743 |
San Bernardino |
0.0000643 |
San Diego |
0.0000366 |
San Francisco |
0.0001040 |
San Joaquin |
0.0002532 |
San Luis Obispo |
0.0003007 |
San Mateo |
0.0001545 |
Santa Barbara |
0.0002503 |
Santa Clara |
0.0000592 |
Santa Cruz |
0.0004006 |
Shasta |
0.0001295 |
Sierra |
0.0000000 |
Siskiyou |
0.0009495 |
Solano |
0.0001537 |
Sonoma |
0.0003687 |
Stanislaus |
0.0004175 |
Sutter |
0.0016483 |
Tehama |
0.0012276 |
Trinity |
0.0013252 |
Tulare |
0.0004762 |
Tuolumne |
0.0010445 |
Ventura |
0.0001375 |
Yolo |
0.0005204 |
Yuba |
0.0022945 |
Results: Covid-19 cumulative case counts are the highest in the following counties: Los Angeles, Riverside, Orange, San Bernardino, and San Diego. The highest new cases over the past 14 days are in the following counties: Imperial, Tulare, Merced, Madera, and Yuba. Despite the covid-19 increases throughout the state the following counties have shown less than 100 new cases over the past 14 days: Alpine, Calaveras, Del Norte, El Dorado, Humboldt, Lake, Mariposa, Modoc, Mono, Nevada, Plumas, Shasta, Siskiyou, Trinity, and Tuolumne. Based on these results you can see that population density plays a large role in the spread of covid-19 and more remote counties in Northern California have less current spread.
Question 2: Covid-19 New York, California, Louisiana, and Florida
Q2 = home %>%
filter(state %in% c("New York","California", "Louisiana", "Florida")) %>%
group_by(state, date) %>% summarise(cases = sum(cases)) %>%
mutate(newCases = cases - lag(cases),
roll7 = zoo::rollmean(newCases, 7, fill = NA, align = 'right')) %>% ungroup()
Q2 %>% ggplot(aes(x = date, y = newCases)) + geom_col(aes(y = newCases), col = NA, fill ="#F5B8B5") +
geom_line(aes(y = roll7), col = "darkred", size = 1) + facet_grid(~state, scale = "free_y") +
ggthemes::theme_wsj() + theme(legend.position = "right")
labs(title = paste("Daily Cases in NY, CA, LA, FL")) +
theme(plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white"),
plot.title = element_text(size = 10, face = 'bold')) +
theme(aspect.ratio = .5)
## NULL
Results: (Although I was unable to process the second graph I’ve studied Covid-19 data since the “Stay in Place” orders were given.) It is important to review population numbers when reflecting on Covid-19 cases to get a better picture of states that are having large peaks or dips within case numbers. Without factoring population, states with higher populations will appear to have way more covid-19 cases, but this is not the true picture. Out of the four states reviewed, California has the largest population overall, but currently does not have the highest covid-19 cases. Since the first set of graphs do not take population into account California would appear much higher than overall and Louisiana would appear to have low cases. Since Louisiana has a much lower population than California, Florida, and New York if population was factored in the Louisiana graph would show much higher per population cases.