SF package

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(sfheaders)

sf - basically a tibble, but one of the columns hold geometric / geographical information (like a shape, or point).

sfheaders - helps with transition from sf to tibble, and vice versa.

Getting data

There are tons of shapefiles online. This is one such example.

nyc = read_sf(dsn = "tl_2019_us_zcta510/", layer = 'tl_2019_us_zcta510') %>%
  mutate(borough = case_when(
    ZCTA5CE10 %in% 10001:10282 ~ "Manhattan",
    
    ZCTA5CE10 %in% 10301:10314 ~ "Staten Island",
    
    ZCTA5CE10 %in% 10451:10475 ~ "Bronx",
    
    ZCTA5CE10 %in% c(11004:11109, 11351:11697) ~ "Queens",
    
    ZCTA5CE10 %in% 11201:11256 ~ "Brooklyn",
    
    T ~ "Not NYC"
  )) %>%
  filter(borough != "Not NYC")

head(nyc, 4)
## Simple feature collection with 4 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -73.99614 ymin: 40.75928 xmax: -73.94701 ymax: 40.78104
## Geodetic CRS:  NAD83
## # A tibble: 4 × 11
##   ZCTA5CE10 GEOID10 CLASSFP10 MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 
##   <chr>     <chr>   <chr>     <chr>   <chr>        <dbl>    <dbl> <chr>      
## 1 10065     10065   B5        G6350   S           984654        0 +40.7646284
## 2 10069     10069   B5        G6350   S           249050        0 +40.7759598
## 3 10075     10075   B5        G6350   S           477137        0 +40.7733630
## 4 10103     10103   B5        G6350   S            24776        0 +40.7607797
## # ℹ 3 more variables: INTPTLON10 <chr>, geometry <MULTIPOLYGON [°]>,
## #   borough <chr>

Quick map!

ggplot(data = nyc) +
  geom_sf(aes(geometry = geometry), fill = "white")

Can also filter, just like a tibble!

nyc %>%
  filter(borough != "Bronx") %>%
  ggplot() +
    geom_sf(aes(geometry = geometry))

Can also color them by borough!

ggplot(data = nyc) +
  geom_sf(aes(geometry = geometry, fill = borough))

If I have some sort of numeric measurement (like, percent of population sick in the zip code), we can plot that as well.

nyc %>%
  mutate(sick = runif(n = nrow(nyc))) %>%
  ggplot() +
    geom_sf(aes(geometry = geometry, fill = sick))

Center of each zip code

Ideally, we’d want to show the population for each zip code as number in the center of the polygon.

nyc = nyc %>%
 mutate(centers = st_centroid(nyc)$geometry,
        people = round(runif(nrow(nyc), max = 100, min = 20)))
## Warning: There was 1 warning in `stopifnot()`.
## ℹ In argument: `centers = st_centroid(nyc)$geometry`.
## Caused by warning:
## ! st_centroid assumes attributes are constant over geometries
head(nyc, 4)
## Simple feature collection with 4 features and 11 fields
## Active geometry column: geometry
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -73.99614 ymin: 40.75928 xmax: -73.94701 ymax: 40.78104
## Geodetic CRS:  NAD83
## # A tibble: 4 × 13
##   ZCTA5CE10 GEOID10 CLASSFP10 MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 
##   <chr>     <chr>   <chr>     <chr>   <chr>        <dbl>    <dbl> <chr>      
## 1 10065     10065   B5        G6350   S           984654        0 +40.7646284
## 2 10069     10069   B5        G6350   S           249050        0 +40.7759598
## 3 10075     10075   B5        G6350   S           477137        0 +40.7733630
## 4 10103     10103   B5        G6350   S            24776        0 +40.7607797
## # ℹ 5 more variables: INTPTLON10 <chr>, geometry <MULTIPOLYGON [°]>,
## #   borough <chr>, centers <POINT [°]>, people <dbl>
nyc %>%
  mutate(sick = runif(n = nrow(nyc))) %>%
  filter(borough == "Brooklyn") %>%
  ggplot() +
    geom_sf(aes(geometry = geometry, fill = sick)) +
    geom_sf_text(aes(geometry = centers, label = people), size = 2, color = "orange")
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

We can try fixing the centers so they are on the polygon (useful in the case of islands, for example)

nyc = nyc %>%
 mutate(centers = st_point_on_surface(nyc)$geometry,
        people = round(runif(nrow(nyc), max = 100, min = 20)))
## Warning: There were 2 warnings in `stopifnot()`.
## The first warning was:
## ℹ In argument: `centers = st_point_on_surface(nyc)$geometry`.
## Caused by warning:
## ! st_point_on_surface assumes attributes are constant over geometries
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
nyc %>%
  mutate(sick = runif(n = nrow(nyc))) %>%
  filter(borough == "Brooklyn") %>%
  ggplot() +
    geom_sf(aes(geometry = geometry, fill = sick)) +
    geom_sf_text(aes(geometry = centers, label = people), size = 2, color = "orange")
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

Finding the zip code of points

Sometimes, we may get the specific coordinates and want to know the zip-code/borough/state where the coordinate is on.

N <- 100
points =   data.frame(x = rnorm(N, mean = -73.9, sd = 0.2), 
                 y = rnorm(N, mean = 40.7, sd = 0.1))

#source: https://stackoverflow.com/questions/68247417/r-how-to-create-points-using-st-point-applied-to-columns-of-data-frame
points = points %>%
  as.matrix() %>%
  st_multipoint() %>%
  st_sfc() %>%
  st_cast('POINT') %>%
  st_as_sf(crs = st_crs(nyc)) #in this step, we are specifying the 
                              #projection used

Let’s plot it:

ggplot() +
  geom_sf(data = nyc, aes(geometry = geometry), fill = "white") +
  geom_sf(data = points)

Now, we can figure out the zip code and borough for each of the points:

points = points %>%
          st_join(nyc, join = st_within) %>%
          select(ZCTA5CE10, borough, geometry)

points
## Simple feature collection with 100 features and 2 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.33807 ymin: 40.42841 xmax: -73.57313 ymax: 40.91527
## Geodetic CRS:  NAD83
## First 10 features:
##    ZCTA5CE10  borough                   geometry
## 1      11413   Queens  POINT (-73.75869 40.6705)
## 2      11372   Queens POINT (-73.89207 40.75123)
## 3      11222 Brooklyn  POINT (-73.9317 40.72833)
## 4      11101   Queens POINT (-73.93245 40.73845)
## 5       <NA>     <NA> POINT (-73.81224 40.80048)
## 6      11413   Queens  POINT (-73.74386 40.6714)
## 7       <NA>     <NA> POINT (-74.09624 40.67627)
## 8      11233 Brooklyn POINT (-73.92832 40.67963)
## 9       <NA>     <NA> POINT (-74.15125 40.80725)
## 10      <NA>     <NA> POINT (-73.98581 40.89503)

Bonus: sampling from geometries

points = st_sample(nyc, size = 1000)

ggplot() +
  geom_sf(data = nyc, aes(geometry = geometry), fill = "white") +
  geom_sf(data = points)