Setup

We will use the following packages:

library( dplyr )
library( ggplot2 )
library( ggmap )
library( leaflet ) 
library( tigris )
library( sp )

Steps to Visualize Biking Data Routes

The following steps are explained in this document.

  1. Create a list of unique stations.
  2. Create a list of unique pairs of stations.
  3. Create route for each pair of stations.
  4. Create a map of NYC.
  5. Add all routes to the map.

For this exercise we will use City Bike NYC data from January of 2015. It was obtained from the file posted on AWS, and saved as an RData file.

Load Data

# Data for Bike Trips for January, 2015

dat <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/bikes.rds")))

dim( dat )
## [1] 285552     16
head( dat )
# Basic Bike Station Info

stations <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/STATIONS.rds")))

dim( stations )
## [1] 330   4
head( stations )
# Data on All Possible Routes (see below)

routes.list <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/ALL_ROUTES_LIST.rds")))

routes.list[1]
## $S.72_to_S.72
##    m km miles seconds minutes hours leg       lon      lat
## 1  0  0     0       0       0     0   1 -73.99576 40.76799
## 2 NA NA    NA      NA      NA    NA  NA -73.99576 40.76799
routes.df <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/ALL_ROUTES_DF.rds")))

head( routes.df )

Step 1: Load a Map of NYC

Let’s start with a simple step, visualizing the city. We can do this in multiple ways.

Shapefiles

The tigris package provides nice functions for downloading Census Tiger shapefiles. The limitation of this approach is that it lacks a nice set of orientating features like streets and parks.

# library(tigris)
# library(sp)

ny <- counties( state="NY" )

# New York County: 36061
# Queens: 36081
# Bronx: 36005
# Brooklyn: 36047
# Staten Island: 36085

manhattan.water <- area_water( state="NY", county="061" )
queens.water <- area_water( state="NY", county="081" )
bronx.water <- area_water( state="NY", county="005" )
brooklyn.water <- area_water( state="NY", county="047" )
# staten.water <- area_water( state="NY", county="085" )


nyc <- ny[ ny$GEOID %in% c(36061,36081,36005,36047) , ]




par( mar=c(0,0,0,0), bg="lightblue" )
plot( nyc, border=NA, col="gray95" )
plot( manhattan.water, col="lightblue", border=NA, add=T )
plot( queens.water, col="lightblue", border=NA, add=T )
plot( bronx.water, col="lightblue", border=NA, add=T )
plot( brooklyn.water, col="lightblue", border=NA, add=T )
# plot( staten.water, col="lightblue", border=NA, add=T )

text( as.numeric(nyc$INTPTLON), as.numeric(nyc$INTPTLAT), nyc$NAME, cex=1 )

# stations.url <- "https://github.com/lecy/CityBikeNYC/raw/master/DATA/STATIONS.rds"
# stations <- readRDS(gzcon(url( stations.url )))
# points( stations$LON, stations$LAT, col="darkred", pch=19, cex=0.3 )

The advantage is that Census shapefiles are drawn as polygons, which keeps the maps simple and clean.

nyc <- ny[ ny$GEOID %in% c(36061,36081,36005,36047) , ]

par( mar=c(0,0,0,0), bg="gray" )
plot( nyc, border=NA, col="white", xlim=c(-74.027,-73.94377), ylim=c(40.67611,40.77737 ) )
plot( manhattan.water, col="gray", border=NA, add=T )
plot( queens.water, col="gray", border=NA, add=T )
plot( bronx.water, col="gray", border=NA, add=T )
plot( brooklyn.water, col="gray", border=NA, add=T )

# points( stations$LON, stations$LAT, col="firebrick2", pch=19, cex=0.5 )

ggmap Package

Alternatively, the ggmap package connects with geographic databases like Google Maps to grab map tiles, which are PNG files the provide a backdrop to the plotting window.

The advantage is that these tiles provide a lot of rich contextual detail and are simple to use.

# library( ggmap )

qmap( "New York City, NY",  zoom=11 ) 

We can fine-tune the maps using options in the get_map() function:

nyc_map <- get_map( location = c(lon = -74.00, lat = 40.71), 
                    maptype = "terrain", col="bw", zoom = 13 )

ggmap( nyc_map, extent="device" )

We can include even more options using the get_googlemap() function, including turning off text that clutters the map.

map <- get_googlemap( center = 'east village, ny', zoom = 13, col="bw",
       style = 'style=feature:all|element:labels|visibility:off' )

ggmap( map, extent="device" )

Leaflet Package

Leaflet allows us to create dynamic web maps that allow the user to zoom and search the map.

# library( leaflet )

leaflet( ) %>% addTiles() %>% setView(-74.0059, 40.7128, zoom = 13)

We can also fine-tune the aesthetics in Leaflet using myriad free map styles:

# Leaflet 2
leaflet() %>%
  addTiles() %>%
  setView(-74.00, 40.71, zoom = 12) %>%
  addProviderTiles("CartoDB.Positron")

Step 2: Create a Database of Unique Stations

Starting with the Bike Routes data provided by CityBike on AWS, we can create a database of all of the stations operating in the city at the time.

names( dat )
##  [1] "tripduration"            "starttime"              
##  [3] "stoptime"                "start.station.id"       
##  [5] "start.station.name"      "start.station.latitude" 
##  [7] "start.station.longitude" "end.station.id"         
##  [9] "end.station.name"        "end.station.latitude"   
## [11] "end.station.longitude"   "bikeid"                 
## [13] "usertype"                "birth.year"             
## [15] "gender"                  "ID"
# Pick start station attributes

keep.these.vars <- c( "start.station.id",
                      "start.station.name",
                     "start.station.latitude",
                     "start.station.longitude" )

stations <- unique( dat[ keep.these.vars ] )

nrow(stations) # 330 unique stations
## [1] 330
names(stations) <- c("ID", "StationName", "LAT", "LON") # Rename column titles

stations <- stations[order(stations$ID),] # put station in ID order from 72 till 3002

rownames(stations) <- NULL

head(stations)
# dplyr approach

# Select four columns from "dat" dataframe and identify unique

stations <- dat %>% 
  select( start.station.id, 
          start.station.name,
          start.station.latitude,
          start.station.longitude  ) %>% 
  distinct                      

We should have 330 unique stations in the city:

nyc <- ny[ ny$GEOID %in% c(36061,36081,36005,36047) , ]

par( mar=c(0,0,0,0), bg="gray" )
plot( nyc, border=NA, col="white", xlim=c(-74.027,-73.94377), ylim=c(40.67611,40.77737 ) )
plot( manhattan.water, col="gray", border=NA, add=T )
plot( queens.water, col="gray", border=NA, add=T )
plot( bronx.water, col="gray", border=NA, add=T )
plot( brooklyn.water, col="gray", border=NA, add=T )
# points( stations$LON, stations$LAT, col="firebrick2", pch=19, cex=0.5 )


points( stations$LON, stations$LAT, col="darkred", pch=19, cex=0.7 )

text( stations$LON, stations$LAT, stations$ID, pos=3, cex=0.6, offset=0.3, col="gray40" )

# library( ggmap )

map <- get_googlemap( center = 'east village, ny', zoom = 13, col="bw",
       style = 'style=feature:all|element:labels|visibility:off' )

myplot <- ggmap( map, extent="device" )

myplot + geom_point( aes(x = LON, y = LAT ), colour="darkred", size=1, data = stations )

Step 3: Identify Station Pairs

We will visualize a bike route using the route() function in ggmap, which builds a set of directions by querying a start and end point through the Google Maps API. Each route will look something like this:

# get the route from Google

rt <- route( from=c(-73.96905,40.75002), 
             to=c(-73.99148,40.72229), 
             mode="bicycling",
             structure="route" )

rt
# plot the route in ggmap

nyc <- qmap( "New York City, NY", color='bw', zoom=13 )  

nyc +  geom_path(  aes( x = rt$lon , y = rt$lat ), 
            colour="red", data=rt, alpha=1, size=2 )

There are only 330 stations, which might make this sounds like a simple problem. Unfortunately, there are quite a few unique combinations of these stations. Specifically, there are 330 x 329 = 108,570 unique routes possible.

We will build the routes database using iterated calls to the Google Maps API using a loop structure that looks something like this:

for( i in c("A","B","C") )
{
  
  for( j in c("A","B","C") )
  {
    
    print( paste( "Route ID: ", i, "-", j, sep="" ) )
    
  }
  
}
## [1] "Route ID: A-A"
## [1] "Route ID: A-B"
## [1] "Route ID: A-C"
## [1] "Route ID: B-A"
## [1] "Route ID: B-B"
## [1] "Route ID: B-C"
## [1] "Route ID: C-A"
## [1] "Route ID: C-B"
## [1] "Route ID: C-C"

Step 4: Build the Routes Database

See the tutoral on Buiding the Routes Database for more info on these steps. The basic script was:

for(i in 1:330)
{
  print(paste("LOOP NUMBER", i))
  flush.console()
  routes <- list()
  for(j in 1:330)
  {
    rt <- try(route(from=c(stations$LON[i], stations$LAT[i]), 
                      to=c(stations$LON[j], stations$LAT[j]), 
                      mode="bicycling",
                      structure="route" 
    ))
    route.name <- paste("S.", stations$ID[i], "_to_S.", stations$ID[j], sep="")
    rt <- cbind(rt, from.to=route.name)
    routes[[j]] <- rt
    names(routes)[j] <- route.name
    print(paste("I=", i, "J=", j))
    flush.console()
  }  # end of j loop
  
  id <- substr(1000 + i, 2, 4)
  list.name <- paste("RoutesFromStation", id, ".rda", sep="")
  save(routes, file=list.name)
}

The script generated routes for all ~108,000 unique station pairs (since there are one-way streets in NYC we did not assume that the route from from point i to point j would be identical to the route from point j to point i). These routes have been saved as a database and are available on GitHub in the DATA folder.

We included a version that stores each route as a separate data frame in a list. The list elements can be referenced by the route name (e.g. “S.71_to_S.398”). We also include a version that has all routes stored as a single data frame.

They can be loaded by:

# Data on All Possible Routes (see below)

routes.list <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/ALL_ROUTES_LIST.rds")))


routes.df <- readRDS(gzcon(url("https://github.com/lecy/CityBikeNYC/raw/master/DATA/ALL_ROUTES_DF.rds")))

Step 5: Mapping Routes

The visualization of CityBike traffic requires that we layer many bike trips onto the base map. We can do this through a loop, or treating each route as a factor level and adding them all together.

Using Core Plot Functions

White Background

# create a bounding box using station locations

max.lat <- max( stations$LAT )
max.lon <- max( stations$LON )
min.lat <- min( stations$LAT )
min.lon <- min( stations$LON )

# uses the list version of routes

par( mar=c(0,0,0,0) )

plot( NA, NA, xlim=c(min.lon,max.lon), ylim=c(min.lat,max.lat), bty="n", 
      yaxt="n", xaxt="n" )

for( i in 1:length( routes.list ) )
{

   # sub.dat <- d2[ ]

   lines( routes.list[[i]]$lon, routes.list[[i]]$lat, col="gray" )

}

points( stations$LON, stations$LAT, col="darkred", pch=19, cex=0.7 )

Black Background

par( mar=c(0,0,0,0), bg="black" )

plot( NA, NA, xlim=c(min.lon,max.lon), ylim=c(min.lat,max.lat) )


for( i in 1:length( routes.list ) )
{

   # sub.dat <- d2[ ]

   lines( routes.list[[i]]$lon, routes.list[[i]]$lat, col="gray" )

}

points( stations$LON, stations$LAT, col="darkorange2", pch=19, cex=1 )

In ggmap

map <- get_googlemap( center = 'east village, ny', zoom = 13, col="bw",
       style = 'style=feature:all|element:labels|visibility:off' )

myplot <- ggmap( map, extent="device" )

# uses the data frame version of routes

myplot <- myplot + 
          
  geom_path( aes(x = lon, y = lat, group = factor(route) ), 
            colour="#1E2B6A", data = routes.df ) + 
  
  geom_point( aes(x = LON, y = LAT ), 
              colour="darkorange2", size=1.5, data = stations )

myplot