library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(sfheaders)
sf - basically a tibble, but one of the columns hold geometric / geographical information (like a shape, or point).
sfheaders - helps with transition from sf to tibble, and vice versa.
There are tons of shapefiles online. This is one such example.
nyc = read_sf(dsn = "tl_2019_us_zcta510/", layer = 'tl_2019_us_zcta510') %>%
mutate(borough = case_when(
ZCTA5CE10 %in% 10001:10282 ~ "Manhattan",
ZCTA5CE10 %in% 10301:10314 ~ "Staten Island",
ZCTA5CE10 %in% 10451:10475 ~ "Bronx",
ZCTA5CE10 %in% c(11004:11109, 11351:11697) ~ "Queens",
ZCTA5CE10 %in% 11201:11256 ~ "Brooklyn",
T ~ "Not NYC"
)) %>%
filter(borough != "Not NYC")
head(nyc, 4)
## Simple feature collection with 4 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -73.99614 ymin: 40.75928 xmax: -73.94701 ymax: 40.78104
## Geodetic CRS: NAD83
## # A tibble: 4 × 11
## ZCTA5CE10 GEOID10 CLASSFP10 MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 10065 10065 B5 G6350 S 984654 0 +40.7646284
## 2 10069 10069 B5 G6350 S 249050 0 +40.7759598
## 3 10075 10075 B5 G6350 S 477137 0 +40.7733630
## 4 10103 10103 B5 G6350 S 24776 0 +40.7607797
## # ℹ 3 more variables: INTPTLON10 <chr>, geometry <MULTIPOLYGON [°]>,
## # borough <chr>
ggplot(data = nyc) +
geom_sf(aes(geometry = geometry), fill = "white")
Can also filter, just like a tibble!
nyc %>%
filter(borough != "Bronx") %>%
ggplot() +
geom_sf(aes(geometry = geometry))
Can also color them by borough!
ggplot(data = nyc) +
geom_sf(aes(geometry = geometry, fill = borough))
If I have some sort of numeric measurement (like, percent of population sick in the zip code), we can plot that as well.
nyc %>%
mutate(sick = runif(n = nrow(nyc))) %>%
ggplot() +
geom_sf(aes(geometry = geometry, fill = sick))
Ideally, we’d want to show the population for each zip code as number in the center of the polygon.
nyc = nyc %>%
mutate(centers = st_centroid(nyc)$geometry,
people = round(runif(nrow(nyc), max = 100, min = 20)))
## Warning: There was 1 warning in `stopifnot()`.
## ℹ In argument: `centers = st_centroid(nyc)$geometry`.
## Caused by warning:
## ! st_centroid assumes attributes are constant over geometries
head(nyc, 4)
## Simple feature collection with 4 features and 11 fields
## Active geometry column: geometry
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -73.99614 ymin: 40.75928 xmax: -73.94701 ymax: 40.78104
## Geodetic CRS: NAD83
## # A tibble: 4 × 13
## ZCTA5CE10 GEOID10 CLASSFP10 MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 10065 10065 B5 G6350 S 984654 0 +40.7646284
## 2 10069 10069 B5 G6350 S 249050 0 +40.7759598
## 3 10075 10075 B5 G6350 S 477137 0 +40.7733630
## 4 10103 10103 B5 G6350 S 24776 0 +40.7607797
## # ℹ 5 more variables: INTPTLON10 <chr>, geometry <MULTIPOLYGON [°]>,
## # borough <chr>, centers <POINT [°]>, people <dbl>
nyc %>%
mutate(sick = runif(n = nrow(nyc))) %>%
filter(borough == "Brooklyn") %>%
ggplot() +
geom_sf(aes(geometry = geometry, fill = sick)) +
geom_sf_text(aes(geometry = centers, label = people), size = 2, color = "orange")
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data
We can try fixing the centers so they are on the polygon (useful in the case of islands, for example)
nyc = nyc %>%
mutate(centers = st_point_on_surface(nyc)$geometry,
people = round(runif(nrow(nyc), max = 100, min = 20)))
## Warning: There were 2 warnings in `stopifnot()`.
## The first warning was:
## ℹ In argument: `centers = st_point_on_surface(nyc)$geometry`.
## Caused by warning:
## ! st_point_on_surface assumes attributes are constant over geometries
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
nyc %>%
mutate(sick = runif(n = nrow(nyc))) %>%
filter(borough == "Brooklyn") %>%
ggplot() +
geom_sf(aes(geometry = geometry, fill = sick)) +
geom_sf_text(aes(geometry = centers, label = people), size = 2, color = "orange")
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data
Sometimes, we may get the specific coordinates and want to know the zip-code/borough/state where the coordinate is on.
N <- 100
points = data.frame(x = rnorm(N, mean = -73.9, sd = 0.2),
y = rnorm(N, mean = 40.7, sd = 0.1))
#source: https://stackoverflow.com/questions/68247417/r-how-to-create-points-using-st-point-applied-to-columns-of-data-frame
points = points %>%
as.matrix() %>%
st_multipoint() %>%
st_sfc() %>%
st_cast('POINT') %>%
st_as_sf(crs = st_crs(nyc)) #in this step, we are specifying the
#projection used
Let’s plot it:
ggplot() +
geom_sf(data = nyc, aes(geometry = geometry), fill = "white") +
geom_sf(data = points)
Now, we can figure out the zip code and borough for each of the points:
points = points %>%
st_join(nyc, join = st_within) %>%
select(ZCTA5CE10, borough, geometry)
points
## Simple feature collection with 100 features and 2 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -74.33807 ymin: 40.42841 xmax: -73.57313 ymax: 40.91527
## Geodetic CRS: NAD83
## First 10 features:
## ZCTA5CE10 borough geometry
## 1 11413 Queens POINT (-73.75869 40.6705)
## 2 11372 Queens POINT (-73.89207 40.75123)
## 3 11222 Brooklyn POINT (-73.9317 40.72833)
## 4 11101 Queens POINT (-73.93245 40.73845)
## 5 <NA> <NA> POINT (-73.81224 40.80048)
## 6 11413 Queens POINT (-73.74386 40.6714)
## 7 <NA> <NA> POINT (-74.09624 40.67627)
## 8 11233 Brooklyn POINT (-73.92832 40.67963)
## 9 <NA> <NA> POINT (-74.15125 40.80725)
## 10 <NA> <NA> POINT (-73.98581 40.89503)
points = st_sample(nyc, size = 1000)
ggplot() +
geom_sf(data = nyc, aes(geometry = geometry), fill = "white") +
geom_sf(data = points)