Note that postmastr is not available on CRAN:
::install_github("slu-openGIS/postmastr")
remotes
<- c( "tidyverse","stringi","tidycensus",
p "textclean","tidygeocoder" )
install.packages(p)
library( postmastr )
library( tidyverse )
library( stringi )
library( tidycensus )
library( textclean )
library( tidygeocoder )
postmastr
PackageBasic Organization of postmastr
To parse our grammar of street addresses, functions can be grouped in two ways. All functions begin with the prefix pm_ in order to take advantage of RStudio’s auto-complete functionality.
First, we have major groups of functions based on their associated grammatical element:
For each group of function, we have a similar menu of options that describe the verb (action) the function implements. For the state family of functions, for instance:
Thanks to Akila Forde for contributions to this section.
The postmastr
packages utilizes a collection of
“dictionaries” that are used for standardization and disambiguation
purposes.
<- pm_dictionary( type = "directional",
x.dir locale = "us",
filter = c("N", "S", "E", "W", "NE", "NW", "SW", "SE"))
head( x.dir, 10 )
<- pm_dictionary( type = "state",
x.state case = c("upper"),
locale = "us")
head( x.state, 10 )
NOTE: The city dictionary functionality is powered by the get_acs function from the tidycensus package. This requires a Census Bureau API key, which can be obtained at http://api.census.gov/data/key_signup.html. Once you have a key, the census_api_key function from tidycensus should be used to set-up the key before proceeding with the creation of any dictionary objects for cities. (Excludes Armed Americas, Armed Americas Europe Middle East etc)
::census_api_key( "yourkeyhere" ) tidycensus
<- function(append, states){
af_pm_dictionary_us_cities
<- af_pm_get_tidycensus(states = states)
out
# optionally append
if (missing(append) == FALSE){
# bind rows
<- dplyr::bind_rows(out, append)
out
# re-order observations
<- out[order(out$city.input),]
out
}
# return output
return(out)
}
# us cities via tidycensus
<- function(states){
af_pm_get_tidycensus
# global bindings
= NAME = NULL
state.abb
# download data
%>%
states ::split(states) %>%
base::map_df(~ suppressMessages(
purrr::get_acs(year = 2020, state = .x, geography = "place", variable = "B01003_001"))) -> data
tidycensus
# create state dictionary
<- data.frame(
dict state.name = c(datasets::state.name),
state.abb = c(datasets::state.abb),
stringsAsFactors = FALSE
)
<- dplyr::filter(dict, state.abb %in% states)
dict <- dict$state.name
dict
# parse state names
%>%
data ::select(NAME) %>%
dplyr::mutate(NAME = stringr::str_replace_all(NAME, pattern = ",", replace = "")) %>%
dplyrpm_parse_place(dictionary = dict) %>%
::mutate(NAME = stringr::str_trim(NAME, side = "right")) -> data
dplyr
# create directory
<- c("city", "town", "village", "CDP")
dict
# parse place types
%>%
data pm_parse_place(dictionary = dict) -> data
# clean-up output
%>%
data ::mutate(NAME = stringr::str_trim(NAME, side = "right")) %>%
dplyr::distinct(NAME, .keep_all = TRUE) %>%
dplyr::rename(city.input = NAME) -> data
dplyr
# re-order observations
<- data[order(data$city.input),]
data
# return output
return(data)
}
<- function(.data, dictionary){
pm_parse_place
# global bindings
= pm.place = NULL
NAME
# iterate over observations
%>%
.data ::mutate(pm.place = purrr::map(NAME, ~ pm_extract_pattern(.x, dictionary = dictionary, end = TRUE))) -> .data
dplyr
# clean address data
%>%
.data ::unnest(pm.place) %>%
tidyr::filter(is.na(pm.place) == FALSE) %>%
dplyr::mutate(pm.place = as.character(pm.place)) %>%
dplyr::mutate(NAME = stringr::str_replace(NAME,
dplyrpattern = stringr::str_c("\\b", pm.place, "\\b$"),
replacement = "")) %>%
::select(-pm.place) -> .data
dplyr
return(.data)
}
# iterate over dictionary items per observations
<- function(x, dictionary, end = TRUE){
pm_extract_pattern
# create pattern vector
<- dictionary
patternVector
%>%
patternVector ::split(patternVector) %>%
base::map( ~ stringr::str_extract(x, pattern = ifelse (end == TRUE,
purrr::str_c("\\b", .x, "\\b$"),
stringr::str_c("\\b", .x, "\\b")))) -> out
stringr
return(out)
}
# takes a bit to process
<- af_pm_dictionary_us_cities(,state.abb) %>% mutate_if(is.character, str_to_upper)
x.city head( x.city )
<-
address c(
"1 WEST 54TH ST NO 38, NEW YORK, NY 10019",
"202 FLANDERS DRAKESTOWN ROAD, FLANDERS, NJ 7836",
"PO BOX 840, ARLINGTON, MN 55307",
"7211 HAVEN AVENUE SUITE E-565, RANCHO CUCAMONGA, CA 91701",
"18810 U S HWY 41, MASARYKTOWN, FL 34604",
"1200 W INTERNATIONAL SPEEDWAY BLVD, DAYTONA BEACH, FL 32114",
"16105 SWINGLEY RIDGE ROAD UNIT 773, CHESTERFIELD, MO 63017",
"C/O EVAN RUSSO 57 TAYLOR BROOK RD, HANCOCK, VT 5748",
"2917 E 47TH ST APT B, INDIANAPOLIS, IN 46205",
"3280 N 960 E, LEHI, UT 84043",
"702 SECOND AVE E, OSKALOOSA, IA 52577",
"3875 POWDER SPRINGS RD STE C, POWDER SPRINGS, GA 30127",
"1730 GIDDINGS AVENUE SOUTHEAST, GRAND RAPIDS, MI 49507",
"PO BOX 74, TAHOLAH, WA 98587",
"BOX 444, INKSTER, MI 48141",
"1990 NE 163RD STREET - SUITE 233, NORTH MIAMI BEACH, FL 33162",
"ONE BEAR PLACE 98006, WACO, TX 76798",
"3023 S UNIVERSITY DRIVE 103, FORT WORTH, TX 76109",
"POST OFFICE BOX 6037, ALAMEDA, CA 94501"
)
Missing numbers in zip codes:
c(2,8)] address[
## [1] "202 FLANDERS DRAKESTOWN ROAD, FLANDERS, NJ 7836"
## [2] "C/O EVAN RUSSO 57 TAYLOR BROOK RD, HANCOCK, VT 5748"
# Assumes the zipcodes are always the last word in the addess, code below has infrastructure if no zip
<- stri_extract_last_words( address )
x.last_words
## Loop to add a zero in front of any shortened zipcodes, only if the zipcode is four digits long
for ( i in 1:length( x.last_words ) ){
if ( nchar( x.last_words[i]) == 4 )
{<-
address[i] paste( word( address[i], 1,-2 ),
paste0( "0", x.last_words[i] ) )
}
}
c(2,8)] address[
## [1] "202 FLANDERS DRAKESTOWN ROAD, FLANDERS, NJ 07836"
## [2] "C/O EVAN RUSSO 57 TAYLOR BROOK RD, HANCOCK, VT 05748"
# requires a data frame
<- paste0( "id-", 1:length(address) )
id <- data.frame( id, address )
d.address head( d.address )
## Need to ensure the data have a unique identification
<- pm_identify( d.address, var="address" )
d.address head( d.address )
## Parse the data for cities, and address
<-
d.parsed %>%
d.address pm_parse( input='full',
address="address",
output = "short",
keep_parsed = "limited",
dir_dict=x.dir,
state_dict=x.state,
city_dict=x.city )
<- d.parsed %>% mutate_if( is.character, str_to_upper )
d.parsed d.parsed
tidygeocoder
https://jessecambon.github.io/tidygeocoder/
library(tibble)
library(dplyr)
library(tidygeocoder)
<- tibble(singlelineaddress = c(
address_single "11 Wall St, NY, NY",
"600 Peachtree Street NE, Atlanta, Georgia"
))
<- tribble(
address_components ~street, ~cty, ~st,
"11 Wall St", "NY", "NY",
"600 Peachtree Street NE", "Atlanta", "GA"
)
<- address_single %>%
census_s1 geocode(address = singlelineaddress, method = "census", verbose = TRUE)
#>
#> Number of Unique Addresses: 2
#> Executing batch geocoding...
#> Batch limit: 10,000
#> Passing 2 addresses to the US Census batch geocoder
#> Querying API URL: https://geocoding.geo.census.gov/geocoder/locations/addressbatch
#> Passing the following parameters to the API:
#> format : "json"
#> benchmark : "Public_AR_Current"
#> vintage : "Current_Current"
#> Query completed in: 1 seconds
<- geo(
osm_s1 address = address_single$singlelineaddress, method = "osm",
lat = latitude, long = longitude
)#> Passing 2 addresses to the Nominatim single address geocoder
#> Query completed in: 2 seconds
geo( ..., method="osm" )
method = the geocoding service used in the call.
Cascade Option:
The “cascade” method first uses one geocoding service and then uses a second geocoding service if the first service didn’t return results. The services and order is specified by the cascade_order argument. Note that this is not compatible with full_results = TRUE as geocoding services have different columns that they return.
Primary Considerations:
Getting Keys
It is different for every service, but for some it is a simple request:
https://api.census.gov/data/key_signup.html
Installing Keys
API keys are loaded from R environmental variables.
::edit_r_environ()
usethis# add:
="YourAPIKeyHere" GEOCODIO_API_KEY
Run usethis::edit_r_environ() to open your .Renviron file and add an API key as an environmental variable. For example, add the line GEOCODIO_API_KEY=“YourAPIKeyHere”
https://nonprofit-open-data-collective.github.io/open-1023-ez-dataset/
<-
d structure(list(id = c("ID-2014-020783630-01", "ID-2014-020783630-02",
"ID-2014-020783630-03", "ID-2014-020783630-04", "ID-2014-020783630-05"
lon = c(-102.14035, -102.10558, -102.1047, -102.1652, -102.1161
), lat = c(32.028698, 32.0139, 32.04242, 32.009197, 32.04332)), row.names = 203:207, class = "data.frame") ),
library( geosphere )
# distm returns a vector of distances in meters
<-d[c("lon","lat")]
dat rownames( dat ) <- d$id
distm( dat, fun = distGeo ) # in meters
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.000 3671.822 3695.200 3191.799 2806.350
## [2,] 3671.822 0.000 3163.598 5657.064 3410.306
## [3,] 3695.200 3163.598 0.000 6799.691 1081.335
## [4,] 3191.799 5657.064 6799.691 0.000 5985.894
## [5,] 2806.350 3410.306 1081.335 5985.894 0.000
# in miles
distm( dat, fun = distGeo ) * 0.001 * 0.621371
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.000000 2.281563 2.2960903 1.983291 1.7437848
## [2,] 2.281563 0.000000 1.9657682 3.515135 2.1190654
## [3,] 2.296090 1.965768 0.0000000 4.225131 0.6719105
## [4,] 1.983291 3.515135 4.2251308 0.000000 3.7194609
## [5,] 1.743785 2.119065 0.6719105 3.719461 0.0000000
###
### CONVERT LATITUDE AND LONGITUDE COORDINATES TO KILOMETERS
###
# degrees to radians
<- function(deg)
deg2rad return(deg*pi/180) }
{
# Calculates the geodesic distance between two points specified by
# latitude/longitude using the Haversine formula.
# Note that lat/long are converted to radians so they
# can be passed to sin/cos functions.
<- function(long1, lat1, long2, lat2)
gcd.hf
{
<- 6371 # R is a constant for the Earth mean radius [km]
R
<- deg2rad(long2 - long1)
delta.long <- deg2rad(lat2 - lat1)
delta.lat
<- sin(delta.lat/2)^2 + cos(deg2rad(lat1)) * cos(deg2rad(lat2)) * sin(delta.long/2)^2
a <- 2 * asin(min(1,sqrt(a)))
c = R * c
d
return(d) # Distance in km
}
# gcd.hf(long1=-86.64147, lat1=32.54221, long2=-87.75474, lat2=30.65488)
# should be 234.9
<- function( latlongs, toMiles=T )
calcDists
{ <- list(rownames(latlongs), rownames(latlongs))
name <- nrow(latlongs)
n <- matrix(0, n, n, dimnames = name)
z
for (i in 1:n)
{ for (j in 1:n)
{<- gcd.hf( long1 = latlongs[i, 2], lat1 = latlongs[i,1],
z[i, j] long2 = latlongs[j, 2], lat2 = latlongs[j,1] )
}
}
if( toMiles == T )
<- z * 0.621371192 }
{ z
return(z)
}
https://www.jla-data.net/eng/routing-in-r-context/
https://github.com/lecy/CityBikeNYC
library( ggmap )
<- route( from=c(-73.96905,40.75002),
rt to=c(-73.99148,40.72229),
mode="bicycling",
structure="route" )
<- qmap( "New York City, NY", color='bw', zoom=13 )
nyc
+ geom_path( aes( x = rt$lon , y = rt$lat ),
nyc colour="#1E2B6A", data=rt, alpha=1, size=2 )
rt
m km miles seconds minutes hours leg lon lat1 96 0.096 0.0596544 12 0.2000000 0.003333333 1 -73.96898 40.74999
2 228 0.228 0.1416792 83 1.3833333 0.023055556 2 -73.96842 40.75075
3 3510 3.510 2.1811140 703 11.7166667 0.195277778 3 -73.97079 40.75174
4 42 0.042 0.0260988 7 0.1166667 0.001944444 4 -73.99096 40.72412
5 158 0.158 0.0981812 70 1.1666667 0.019444444 5 -73.99103 40.72375
6 15 0.015 0.0093210 23 0.3833333 0.006388889 6 -73.99165 40.72241
7 8 0.008 0.0049712 22 0.3666667 0.006111111 7 -73.99149 40.72237
8 NA NA NA NA NA NA NA -73.99152 40.72230
To add Census data you either need (1) FIPS codes for your geographic units like counties, tracts, blocks, etc. or (2) latitude and longitude coordinates used for a spatial join.
Upload your file with lat-lon coordinates to the IPUMS website and they will return a file with ~20 census fields appended to the dataset.
library( dplyr )
library( sf )
library( tigris )
### Spatial join appending polygon data frame to any
### points contained within the polygons
### sf::st_join( d.sf, tracts, join = st_within, left=FALSE )
### Download tract shapefiles from Census
### tigris::tracts( state=state, year="2010" )
<- readRDS( "Data/_Latest_dataset/NPO_2014-2020.rds" )
d <- select( d, ID, lat, lon )
d2 <- filter( d2, ! is.na(lat) )
d3
<- st_as_sf( d3, coords=c("lon","lat"), crs=4326 )
d.sf
# get state names - all 50 + DC
<- table( d$State )
tab <- tab[ tab > 600 ]
tab <- names(tab)
states
<- function( state )
get_tracts
{<- tigris::tracts( state=state, year="2010" )
t <- sf::st_transform( t, 4326 )
t return(t)
}
<- NULL
results
for( i in states )
{<- get_tracts( i )
tracts <- sf::st_join( d.sf, tracts, join = st_within, left=FALSE )
df <- dplyr::bind_rows( results, df )
results
}
<- as.data.frame( results )
df <- unique( df )
df <- merge( d, df, by="ID", all.x=TRUE ) dd