Ian Lyttle 2020-09-02
The purpose of this document is to wrangle the data into useful forms.
We will write out two data frames: iowa_counties
and
iowa_county_population
.
library("fs")
library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.1
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1.9000 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("lubridate")
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("readxl")
library("rvest")
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
dir_source <- path("data", "download")
dir_target <- path("data", "wrangle")
dir_create(dir_target)
iowa_county_population <-
read_xls(
path = path(dir_source, "iowa_counties_population.xls"),
sheet = "Counties",
range = "A7:M107"
) %>%
transmute(
fips = Fips,
county = str_replace(Area, " County, Iowa", ""),
population = `2019`
) %>%
filter(fips > 19) %>%
arrange(population) %>%
mutate(
cumulative_poulation = cumsum(population),
quartile_population = cumulative_poulation/max(cumulative_poulation),
population_group = cut(
quartile_population,
breaks = c(0, 0.25, 0.50, 0.78, 1),
labels = c("small", "mid-small", "mid-large", "large")
# labels = c("FT-AA", "PW-HN", "ST-DA", "PK-LN")
),
population_group = fct_rev(population_group)
) %>%
arrange(desc(cumulative_poulation)) %>%
print()
## # A tibble: 99 x 6
## fips county population cumulative_poula… quartile_popula… population_group
## <dbl> <chr> <dbl> <dbl> <dbl> <fct>
## 1 19153 Polk 490161 3155070 1 large
## 2 19113 Linn 226706 2664909 0.845 large
## 3 19163 Scott 172943 2438203 0.773 mid-large
## 4 19103 Johnson 151140 2265260 0.718 mid-large
## 5 19013 Black H… 131228 2114120 0.670 mid-large
## 6 19193 Woodbury 103107 1982892 0.628 mid-large
## 7 19061 Dubuque 97311 1879785 0.596 mid-large
## 8 19169 Story 97117 1782474 0.565 mid-large
## 9 19049 Dallas 93453 1685357 0.534 mid-large
## 10 19155 Pottawa… 93206 1591904 0.505 mid-large
## # … with 89 more rows
Let’s read in the county data:
us_counties <-
read_csv(path(dir_source, "us-counties.csv"))
## Parsed with column specification:
## cols(
## date = col_date(format = ""),
## county = col_character(),
## state = col_character(),
## fips = col_character(),
## cases = col_double(),
## deaths = col_double()
## )
For now, let’s get the Iowa data, only a few of the columns, and we will create a column to note the type of aggregation (“none” for the county-level data).
iowa_counties_nyt <-
us_counties %>%
filter(state == "Iowa") %>%
select(date, county, cases, deaths)
date_scrape <-
file_info(path(dir_source, "access.html")) %>%
`[[`("change_time") %>%
max() %>%
as.Date()
html_scrape <-
read_html(path(dir_source, "access.html"))
iowa_counties_content <-
html_scrape %>%
html_nodes("td") %>%
map_chr(html_text)
ind_counties <- which(iowa_counties_content %in% iowa_county_population$county)
iowa_counties_scrape <-
tibble(
date = date_scrape,
county = iowa_counties_content[ind_counties],
cases = as.numeric(iowa_counties_content[ind_counties + 2]),
deaths = as.numeric(iowa_counties_content[ind_counties + 4])
) %>%
mutate(
cases = ifelse(is.na(cases), 0, cases),
deaths = ifelse(is.na(deaths), 0, deaths),
)
iowa_counties_combined <- iowa_counties_nyt
if (date_scrape > max(iowa_counties_nyt$date)) {
iowa_counties_combined <- bind_rows(iowa_counties_nyt, iowa_counties_scrape)
}
iowa_counties <-
iowa_counties_combined %>%
filter(county %in% iowa_county_population$county) %>%
group_by(county) %>%
arrange(desc(date)) %>% # take care of decreasing-counts
mutate(
cases = cummin(cases),
deaths = cummin(deaths)
) %>%
arrange(date) %>%
mutate(
new_cases = cases - lag(cases, default = 0),
new_deaths = deaths - lag(deaths, default = 0),
new_cases_week_avg = (cases - lag(cases, n = 7, default = 0)) / 7,
new_deaths_week_avg = (deaths - lag(deaths, n = 7, default = 0)) / 7,
aggregation = "none"
) %>%
ungroup() %>%
arrange(desc(date), desc(cases)) %>%
print()
## # A tibble: 15,145 x 9
## date county cases deaths new_cases new_deaths new_cases_week_…
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-09-02 Polk 13511 226 46 0 203.
## 2 2020-09-02 Woodb… 4157 56 5 0 21.1
## 3 2020-09-02 Johns… 4098 26 47 0 165.
## 4 2020-09-02 Black… 3830 76 0 0 35.7
## 5 2020-09-02 Linn 3005 94 12 0 25
## 6 2020-09-02 Story 2549 16 15 0 97
## 7 2020-09-02 Dallas 2391 38 5 0 31.9
## 8 2020-09-02 Scott 2254 21 5 0 27.6
## 9 2020-09-02 Dubuq… 2062 36 13 0 17.7
## 10 2020-09-02 Buena… 1828 12 1 0 0.857
## # … with 15,135 more rows, and 2 more variables: new_deaths_week_avg <dbl>,
## # aggregation <chr>
write_csv(iowa_counties, path(dir_target, "iowa-counties.csv"))
write_csv(iowa_county_population, path(dir_target, "iowa-county-population.csv"))