We will use the following packages:
library( dplyr )
library( ggplot2 )
library( ggmap )
library( leaflet )
library( tigris )
library( sp )
The following steps are explained in this document.
For this exercise we will use City Bike NYC data from January of 2015. It was obtained from the file posted on AWS, and saved as an RData file.
# Data for Bike Trips for January, 2015
dat <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/bikes.rds")))
dim( dat )
## [1] 285552 16
head( dat )
# Basic Bike Station Info
stations <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/STATIONS.rds")))
dim( stations )
## [1] 330 4
head( stations )
# Data on All Possible Routes (see below)
routes.list <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/ALL_ROUTES_LIST.rds")))
routes.list[1]
## $S.72_to_S.72
## m km miles seconds minutes hours leg lon lat
## 1 0 0 0 0 0 0 1 -73.99576 40.76799
## 2 NA NA NA NA NA NA NA -73.99576 40.76799
routes.df <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/ALL_ROUTES_DF.rds")))
head( routes.df )
Let’s start with a simple step, visualizing the city. We can do this in multiple ways.
The tigris
package provides nice functions for downloading Census Tiger shapefiles. The limitation of this approach is that it lacks a nice set of orientating features like streets and parks.
# library(tigris)
# library(sp)
ny <- counties( state="NY" )
# New York County: 36061
# Queens: 36081
# Bronx: 36005
# Brooklyn: 36047
# Staten Island: 36085
manhattan.water <- area_water( state="NY", county="061" )
queens.water <- area_water( state="NY", county="081" )
bronx.water <- area_water( state="NY", county="005" )
brooklyn.water <- area_water( state="NY", county="047" )
# staten.water <- area_water( state="NY", county="085" )
nyc <- ny[ ny$GEOID %in% c(36061,36081,36005,36047) , ]
par( mar=c(0,0,0,0), bg="lightblue" )
plot( nyc, border=NA, col="gray95" )
plot( manhattan.water, col="lightblue", border=NA, add=T )
plot( queens.water, col="lightblue", border=NA, add=T )
plot( bronx.water, col="lightblue", border=NA, add=T )
plot( brooklyn.water, col="lightblue", border=NA, add=T )
# plot( staten.water, col="lightblue", border=NA, add=T )
text( as.numeric(nyc$INTPTLON), as.numeric(nyc$INTPTLAT), nyc$NAME, cex=1 )
# stations.url <- "https://github.com/lecy/CityBikeNYC/raw/master/DATA/STATIONS.rds"
# stations <- readRDS(gzcon(url( stations.url )))
# points( stations$LON, stations$LAT, col="darkred", pch=19, cex=0.3 )
The advantage is that Census shapefiles are drawn as polygons, which keeps the maps simple and clean.
nyc <- ny[ ny$GEOID %in% c(36061,36081,36005,36047) , ]
par( mar=c(0,0,0,0), bg="gray" )
plot( nyc, border=NA, col="white", xlim=c(-74.027,-73.94377), ylim=c(40.67611,40.77737 ) )
plot( manhattan.water, col="gray", border=NA, add=T )
plot( queens.water, col="gray", border=NA, add=T )
plot( bronx.water, col="gray", border=NA, add=T )
plot( brooklyn.water, col="gray", border=NA, add=T )
# points( stations$LON, stations$LAT, col="firebrick2", pch=19, cex=0.5 )
Alternatively, the ggmap
package connects with geographic databases like Google Maps to grab map tiles, which are PNG files the provide a backdrop to the plotting window.
The advantage is that these tiles provide a lot of rich contextual detail and are simple to use.
# library( ggmap )
qmap( "New York City, NY", zoom=11 )
We can fine-tune the maps using options in the get_map()
function:
nyc_map <- get_map( location = c(lon = -74.00, lat = 40.71),
maptype = "terrain", col="bw", zoom = 13 )
ggmap( nyc_map, extent="device" )
We can include even more options using the get_googlemap()
function, including turning off text that clutters the map.
map <- get_googlemap( center = 'east village, ny', zoom = 13, col="bw",
style = 'style=feature:all|element:labels|visibility:off' )
ggmap( map, extent="device" )
Leaflet allows us to create dynamic web maps that allow the user to zoom and search the map.
# library( leaflet )
leaflet( ) %>% addTiles() %>% setView(-74.0059, 40.7128, zoom = 13)
We can also fine-tune the aesthetics in Leaflet using myriad free map styles:
# Leaflet 2
leaflet() %>%
addTiles() %>%
setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("CartoDB.Positron")
Starting with the Bike Routes data provided by CityBike on AWS, we can create a database of all of the stations operating in the city at the time.
names( dat )
## [1] "tripduration" "starttime"
## [3] "stoptime" "start.station.id"
## [5] "start.station.name" "start.station.latitude"
## [7] "start.station.longitude" "end.station.id"
## [9] "end.station.name" "end.station.latitude"
## [11] "end.station.longitude" "bikeid"
## [13] "usertype" "birth.year"
## [15] "gender" "ID"
# Pick start station attributes
keep.these.vars <- c( "start.station.id",
"start.station.name",
"start.station.latitude",
"start.station.longitude" )
stations <- unique( dat[ keep.these.vars ] )
nrow(stations) # 330 unique stations
## [1] 330
names(stations) <- c("ID", "StationName", "LAT", "LON") # Rename column titles
stations <- stations[order(stations$ID),] # put station in ID order from 72 till 3002
rownames(stations) <- NULL
head(stations)
# dplyr approach
# Select four columns from "dat" dataframe and identify unique
stations <- dat %>%
select( start.station.id,
start.station.name,
start.station.latitude,
start.station.longitude ) %>%
distinct
We should have 330 unique stations in the city:
nyc <- ny[ ny$GEOID %in% c(36061,36081,36005,36047) , ]
par( mar=c(0,0,0,0), bg="gray" )
plot( nyc, border=NA, col="white", xlim=c(-74.027,-73.94377), ylim=c(40.67611,40.77737 ) )
plot( manhattan.water, col="gray", border=NA, add=T )
plot( queens.water, col="gray", border=NA, add=T )
plot( bronx.water, col="gray", border=NA, add=T )
plot( brooklyn.water, col="gray", border=NA, add=T )
# points( stations$LON, stations$LAT, col="firebrick2", pch=19, cex=0.5 )
points( stations$LON, stations$LAT, col="darkred", pch=19, cex=0.7 )
text( stations$LON, stations$LAT, stations$ID, pos=3, cex=0.6, offset=0.3, col="gray40" )
# library( ggmap )
map <- get_googlemap( center = 'east village, ny', zoom = 13, col="bw",
style = 'style=feature:all|element:labels|visibility:off' )
myplot <- ggmap( map, extent="device" )
myplot + geom_point( aes(x = LON, y = LAT ), colour="darkred", size=1, data = stations )
We will visualize a bike route using the route()
function in ggmap, which builds a set of directions by querying a start and end point through the Google Maps API. Each route will look something like this:
# get the route from Google
rt <- route( from=c(-73.96905,40.75002),
to=c(-73.99148,40.72229),
mode="bicycling",
structure="route" )
rt
# plot the route in ggmap
nyc <- qmap( "New York City, NY", color='bw', zoom=13 )
nyc + geom_path( aes( x = rt$lon , y = rt$lat ),
colour="red", data=rt, alpha=1, size=2 )
There are only 330 stations, which might make this sounds like a simple problem. Unfortunately, there are quite a few unique combinations of these stations. Specifically, there are 330 x 329 = 108,570 unique routes possible.
We will build the routes database using iterated calls to the Google Maps API using a loop structure that looks something like this:
for( i in c("A","B","C") )
{
for( j in c("A","B","C") )
{
print( paste( "Route ID: ", i, "-", j, sep="" ) )
}
}
## [1] "Route ID: A-A"
## [1] "Route ID: A-B"
## [1] "Route ID: A-C"
## [1] "Route ID: B-A"
## [1] "Route ID: B-B"
## [1] "Route ID: B-C"
## [1] "Route ID: C-A"
## [1] "Route ID: C-B"
## [1] "Route ID: C-C"
See the tutoral on Buiding the Routes Database for more info on these steps. The basic script was:
for(i in 1:330)
{
print(paste("LOOP NUMBER", i))
flush.console()
routes <- list()
for(j in 1:330)
{
rt <- try(route(from=c(stations$LON[i], stations$LAT[i]),
to=c(stations$LON[j], stations$LAT[j]),
mode="bicycling",
structure="route"
))
route.name <- paste("S.", stations$ID[i], "_to_S.", stations$ID[j], sep="")
rt <- cbind(rt, from.to=route.name)
routes[[j]] <- rt
names(routes)[j] <- route.name
print(paste("I=", i, "J=", j))
flush.console()
} # end of j loop
id <- substr(1000 + i, 2, 4)
list.name <- paste("RoutesFromStation", id, ".rda", sep="")
save(routes, file=list.name)
}
The script generated routes for all ~108,000 unique station pairs (since there are one-way streets in NYC we did not assume that the route from from point i to point j would be identical to the route from point j to point i). These routes have been saved as a database and are available on GitHub in the DATA folder.
We included a version that stores each route as a separate data frame in a list. The list elements can be referenced by the route name (e.g. “S.71_to_S.398”). We also include a version that has all routes stored as a single data frame.
They can be loaded by:
# Data on All Possible Routes (see below)
routes.list <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/ALL_ROUTES_LIST.rds")))
routes.df <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/ALL_ROUTES_DF.rds")))
The visualization of CityBike traffic requires that we layer many bike trips onto the base map. We can do this through a loop, or treating each route as a factor level and adding them all together.
# create a bounding box using station locations
max.lat <- max( stations$LAT )
max.lon <- max( stations$LON )
min.lat <- min( stations$LAT )
min.lon <- min( stations$LON )
# uses the list version of routes
par( mar=c(0,0,0,0) )
plot( NA, NA, xlim=c(min.lon,max.lon), ylim=c(min.lat,max.lat), bty="n",
yaxt="n", xaxt="n" )
for( i in 1:length( routes.list ) )
{
# sub.dat <- d2[ ]
lines( routes.list[[i]]$lon, routes.list[[i]]$lat, col="gray" )
}
points( stations$LON, stations$LAT, col="darkred", pch=19, cex=0.7 )
par( mar=c(0,0,0,0), bg="black" )
plot( NA, NA, xlim=c(min.lon,max.lon), ylim=c(min.lat,max.lat) )
for( i in 1:length( routes.list ) )
{
# sub.dat <- d2[ ]
lines( routes.list[[i]]$lon, routes.list[[i]]$lat, col="gray" )
}
points( stations$LON, stations$LAT, col="darkorange2", pch=19, cex=1 )
map <- get_googlemap( center = 'east village, ny', zoom = 13, col="bw",
style = 'style=feature:all|element:labels|visibility:off' )
myplot <- ggmap( map, extent="device" )
# uses the data frame version of routes
myplot <- myplot +
geom_path( aes(x = lon, y = lat, group = factor(route) ),
colour="#1E2B6A", data = routes.df ) +
geom_point( aes(x = LON, y = LAT ),
colour="darkorange2", size=1.5, data = stations )
myplot