library( igraph )
library( pander )

Building a Board Interlock Network

Board interlock is a phenomenon that occurs when multiple organizations share the sample board members. These individuals occupy an important space on the organizational landscape because they mediate information and resources in many cases.

This research vignette presents a method for generating network files from lists of board members using an approximate matching technique.

Exact Matching

If we are working with real data that has not been generated through a very structured process, the data is going to be messy and imprecise.

A primary source of data for nonprofit scholarship, the 990 tax forms, is fully of messy and imprecise data.

For example, if we want to determine whether one individual serves on multiple nonprofit boards, we can do this by matching board member names across organizations.

The problem we will run into is that any small difference in how the name is spelled will result in an false negative match. Take these cases below:

"CHARLES F COLE"  ==  "CHARLES F COLE"
## [1] TRUE
"CHARLES F COLE"  ==  "CHARLES COLE"
## [1] FALSE
"CHARLIE F COLE"  ==  "CHARLES F COLE"
## [1] FALSE
"CHUCK F COLE"  ==  "CHARLES F COLE"
## [1] FALSE
"CHARLES F COLE"  ==  "COLE, CHARLES F"
## [1] FALSE

A human would look at these cases and decide that they are all matches. A computer will look at these cases and decide that only the first one is a match.

Fuzzy Matching

We can overcome this problem using approximate matching, or “fuzzy” matching techniques.

There are a large variety of methods available for this, but the base R package contains a version that uses an edit distance calculation to determine if two words are close.

agrep( "lazy", c("daisy", "lasy", "larry" ), max.distance = 0.1, value = TRUE )
## [1] "lasy"

We can see that this method would give us reasonable matches in most of these cases now. A human would probably recognize that the last case is a match, where the computer would need to know the rule that a comma signifies that name order has been reversed.

agrepl( "CHARLES F COLE", "CHARLES F COLE" )
## [1] TRUE
agrepl( "CHARLES F COLE", "CHARLES COLE" )
## [1] TRUE
agrepl( "CHARLIE F COLE", "CHARLES F COLE" )
## [1] TRUE
agrepl( "CHUCK F COLE",  "CHARLES F COLE" )
## [1] FALSE
agrepl( "CHUCK F BORROWITZSKY",  "CHARLES F BORROWITZSKY" )
## [1] FALSE

The second to last case is a little trickier because one nonprofit is using the full name “Charles” while the other nonprofit is using the nickname “Chuck”. We can deal with this by relaxing the edit distance threshold or changing the cost of specific edits, but any of these changes will need to balance sensitivity and specificity of the matches.

agrepl( "CHUCK F COLE",  "CHARLES F COLE", max.distance = 0.4 )
## [1] TRUE

Consider some of these cases:

BOARD.MEMBER.V1 BOARD.MEMBER.V2 EXACT.MATCH FUZZY.MATCH
EILEEN HALL EILEEN HALL TRUE TRUE
EILEEN HALL EILEEN HANLEY FALSE TRUE
GEORGE HEARST GEORGE R HEARST FALSE TRUE
PAUL F COLE PAUL COLE FALSE TRUE
J THALIA CUNNINGHAM JOSIE THALIA CUNNINGHAM FALSE TRUE
MARY JANE PAULING MARY J PAULING FALSE FALSE
BERT SCHOU ROBERT SCHOFIELD FALSE TRUE
AL OCONNOR E MICHAEL OCONNOR FALSE TRUE
TOM COX TOM CONWAY FALSE TRUE

Load Data

We are going to use board data from a mid-sized US city in 2000. This network data has been generated through an iterative pair-wise comparison of board members across organizations.

Thanks to Nara Yoon for sharing data from her dissertation project for this example.

source( "https://raw.githubusercontent.com/lecy/arnova-2017-workshop/master/workshop/data/net_data_2000.R" )

source( "https://raw.githubusercontent.com/lecy/arnova-2017-workshop/master/workshop/data/attribute_data.R" )

Network from Edgelist

We will use the graph_from_data_frame() function in the igraph package to load our data as a network object. In this example, nodes represent nonprofits and each tie represents a board member. Two nonprofits can share multiple board members, which is reflected by the number of ties between nodes.

We can see that when we connect all of the interlocking board members across the city some interesting structures emerge.

# read in network ties plus attributes

df <- net.2000[ c("org1.ein","org2.ein") ]

net <- graph_from_data_frame( d=df, vertices=att, directed=F ) 



V(net)$size <- 4
V(net)$frame.color <- "white"
V(net)$color <- "orange"
V(net)$label <- "" 
E(net)$arrow.mode <- 0

par( mar=c(0,0,0,0) )
plot( net, layout=layout_nicely(net) )

Subsector Graphs

# scale between 2 and 14 for size

V(net)$size <- 4 + 10*( degree( net, mode="all" ) / max(degree( net, mode="all" )) )
V(net)$NTMAJ5 <- gsub( " ", "", V(net)$NTMAJ5 )

keep.these <- V(net)$NTMAJ5 == "AR"
keep.these[ is.na(keep.these) ] <- FALSE
arts <- V(net)[ keep.these ]
V(net)[ keep.these ]$color <- "orange"
V(net)[ keep.these ]$frame.color <- "orange"
g.ar <- induced_subgraph( graph=net, vids=arts )

keep.these <- V(net)$NTMAJ5 == "HU"
keep.these[ is.na(keep.these) ] <- FALSE
V(net)[ keep.these ]$color <- "steelblue"
V(net)[ keep.these ]$frame.color <- "steelblue"
human.services <- V(net)[ keep.these ]
g.hs <- induced_subgraph( graph=net, vids=human.services )

keep.these <- V(net)$NTMAJ5 == "ED"
keep.these[ is.na(keep.these) ] <- FALSE
V(net)[ keep.these ]$color <- "gray20"
V(net)[ keep.these ]$frame.color <- "gray20"
education <- V(net)[ keep.these ]
g.ed <- induced_subgraph( graph=net, vids=education )

keep.these <- V(net)$NTMAJ5 == "HE"
keep.these[ is.na(keep.these) ] <- FALSE
V(net)[ keep.these ]$color <- "darkred"
V(net)[ keep.these ]$frame.color <- "darkred"
health <- V(net)[ keep.these ]
g.he <- induced_subgraph( graph=net, vids=health )

keep.these <- V(net)$NTMAJ5 == "OT"
keep.these[ is.na(keep.these) ] <- FALSE
other <- V(net)[ keep.these ]
V(net)[ keep.these ]$color <- "gray80"
V(net)[ keep.these ]$frame.color <- "gray80"
g.ot <- induced_subgraph( graph=net, vids=other )



par( mar=c(0,0,2,0), mfrow=c(2,2) )

plot( g.hs, 
      layout=layout_with_kk(g.hs), 
      edge.color="gray50", 
      vertex.label=NA )
title( main="Human Services", line=0 )

plot( g.ar, 
      layout=layout_with_kk(g.ar), 
      edge.color="gray50", 
      vertex.label=NA )
title( main="Arts", line=0 )

plot( g.ed, 
      layout=layout_with_kk(g.ed), 
      edge.color="gray50", 
      vertex.label=NA )
title( main="Education", line=0 )

plot( g.he, 
      layout=layout_with_kk(g.he), 
      edge.color="gray50", 
      vertex.label=NA )
title( main="Health", line=0 )

Plot Separately

plot( g.hs, 
      layout=layout_with_kk(g.hs), 
      edge.color="gray50", 
      vertex.label=NA )
title( main="Human Services", line=0 )

plot( g.ar, 
      layout=layout_with_kk(g.ar), 
      edge.color="gray50", 
      vertex.label=NA )
title( main="Arts", line=0 )

plot( g.ed, 
      layout=layout_with_kk(g.ed), 
      edge.color="gray50", 
      vertex.label=NA )
title( main="Education", line=0 )

plot( g.he, 
      layout=layout_with_kk(g.he), 
      edge.color="gray50", 
      vertex.label=NA )
title( main="Health", line=0 )

Show Only the Core

keep.these <- V(net)[ degree( net, mode="all" ) > 25 ]

g.sub <- induced_subgraph( graph=net, vids=keep.these )

par( mar=c(0,0,0,0) )
plot( g.sub, layout=layout_with_kk(g.sub), 
      edge.color="gray90", 
      vertex.label=V(g.sub)$NAME, vertex.label.cex=0.5, vertex.label.dist=1, vertex.label.color="darkgray",
      vertex.size=15*(degree(g.sub)/max(degree(g.sub)) ) )

Scale Size of Nodes by AGE

V(net)$AGE[ is.na( V(net)$AGE ) ] <- median( V(net)$AGE, na.rm=T )

V(net)$size <- 1 + 10*( V(net)$AGE / max( V(net)$AGE ) )
V(net)$color <- "orange"

par( mar=c(0,0,0,0) )
plot( net, layout=layout_nicely(net) )