Data Wrangling

R Packages

library( haven )        # importing data files 
library( tidyr )        # data wrangling
library( dplyr )        # data wrangling 
library( ggplot2 )      # fancy plots 
library( ggthemes )     # fancy plots
library( scales )       # re-scaling numbers
library( stargazer )    # nice tables 
library( pander )       # format tables for HTML 
library( DT )           # embed datasets in HTML docs

source( "r-functions.R" )

Import Raw Data

###
### BUSINESS MASTER FILES 
###

bmf <- read.csv( "01-data-raw/nccs-bmf-2019.csv" )

# bmf.2019 <- "https://www.dropbox.com/s/bi9ii23bo8l521h/nccs-bmf-2019.csv?dl=1"
# bmf <- read.csv( bmf.2019 )

bmf <- dplyr::select( bmf, EIN, NAME, ADDRESS, CITY, STATE, FIPS,  
               RULEDATE, NTEEFINAL, NTEECC,
               NTEE1, NTMAJ5, NTMAJ10, NTMAJ12, 
               LEVEL1,  LEVEL2, LEVEL3, LEVEL4 )    

bmf$RULEDATE <- substr( bmf$RULEDATE, 1, 4 ) %>% as.numeric()
bmf$RULEDATE[ bmf$RULEDATE < 1900 ] <- NA

bmf$AGE <- 2022 - bmf$RULEDATE 


# Coverage of 2019 BMF file is over 99%
#
# > nrow( core.2019.pc )
# [1] 304441
# > sum( core.2019.pc$ein %in% bmf$EIN )
# [1] 301855



### 
### ADD CENSUS REGIONS
### 

census <- read.csv( "01-data-raw/us-census-bureau-regions-and-divisions.csv" )

# url.census <- "https://raw.githubusercontent.com/lecy/fiscal-health/main/01-data-raw/us-census-bureau-regions-and-divisions.csv"
# census <- read.csv( url.census )

bmf <- merge( bmf, census, by.x="STATE", by.y="State.Code", all.x=TRUE )




###
### CORE PC
###

# COMMON VARIABLES FOR DESCRIPTIVES
# 
# totrevenue - total revenue 
# totfuncexpns - total functional expenses 
# lndbldgsequipend - land, buildings, equipment net assets
# totnetassetend - net assets

# VARIABLES USED FOR METRICS
# 
# totassetsend - total assets end of year
# totliabend - total liabilities end of year

core.2019.pc <- read.csv( "01-data-raw/irs-990-pc-soi-extract-2019.csv" )

# url.soi.pc <- "https://www.dropbox.com/s/88y9sup4d3mwoo5/irs-990-pc-soi-extract-2019.csv?dl=1"
# core.2019.pc <- read.csv( url.soi.pc )

core.2019.pc <- 
  core.2019.pc %>% 
  dplyr::select( ein,
          tax_pd,  
          accntspayableend,
          accntsrcvblend,
          deprcatndepletn,
          grntspayableend,
          invntriesalesend,
          lndbldgsequipend,
          nonintcashend,
          pldgegrntrcvblend,
          prepaidexpnsend,
          svngstempinvend,
          totassetsend,
          totfuncexpns,
          totliabend,
          totnetassetend,
          totprgmrevnue,
          totrevenue,
          unrstrctnetasstsend,
          totcntrbgfts,
          netincfndrsng,
          invstmntinc, 
          txexmptbndsproceeds, 
          netrntlinc, 
          netgnls, 
          royaltsinc, 
          netincsales, 
          miscrevtot11e)  


core.2019.pc$tax.form <- "990PC"

# F9_01_EXP_REV_LESS_EXP_CY
# F9_01_EXP_TOT_CY
# F9_01_REV_TOT_CY
# F9_08_REV_CONTR_GOVT_GRANT
# F9_08_REV_TOT_TOT
# F9_09_EXP_DEPREC_TOT
# F9_09_EXP_INT_TOT
# F9_09_EXP_TOT_MGMT
# F9_09_EXP_TOT_PROG
# F9_09_EXP_TOT_TOT
# F9_09_EXP_TOT_TOT-F9_09_EXP_DEPREC_TOT
# F9_10_ASSET_INV_SALE_EOY
# F9_10_ASSET_LAND_BLDG_NET_EOY
# F9_10_LIAB_MTG_NOTE_EOY
# F9_10_NAFB_UNRESTRICT_EOY
# F9_10_NAFB_UNRESTRICT_BOY (edited) 






###
### CORE EZ 
###

# COMMON VARIABLES FOR DESCRIPTIVES
#
#  totrevnue - total revenue 
#  totexpns - total functional expenses 
#  NA - land, buildings, equipment net assets
#  networthend - net assets

# VARIABLES USED FOR METRICS
#
# totassetsend - total assets end of year
# totliabend - total liabilities end of year

core.2019.ez <- read.csv( "01-data-raw/irs-990-ez-soi-extract-2019.csv" )

# url.soi.ez <- "https://www.dropbox.com/s/2amoe8woboyxpza/irs-990-ez-soi-extract-2019.csv?dl=1"
# core.2019.ez <- read.csv( url.soi.ez )

core.2019.ez <- dplyr::select( core.2019.ez, EIN, tax_pd, 
                               totrevnue, totexpns, networthend,
                               totassetsend, totliabend, 
                               prgmservrev, totcntrbs) 

core.2019.ez <- dplyr::rename( core.2019.ez,
                        ein = EIN,
                        totrevenue = totrevnue,
                        totprgmrevnue = prgmservrev,
                        totfuncexpns = totexpns,
                        totnetassetend = networthend,
                        totassetsend = totassetsend,
                        totliabend = totliabend,
                        totcntrbgfts = totcntrbs )


core.2019.ez$tax.form <- "990EZ"



###
###  STACK PC AND EZ FILES 
###  

core <- dplyr::bind_rows( core.2019.pc, core.2019.ez )

core$taxyr <- substr( core$tax_pd, 1, 4 )




### 
###  ADD BMF META-DATA GEO, NTEE, AND AGE FIELDS
### 

# check for leading zeros problem
# class( core.2019.pc$ein )
# [1] "integer"
# class( bmf$EIN )
# [1] "integer"

core <- merge( core, bmf, by.x="ein", by.y="EIN", all.x=TRUE )




###
### ADD NTEE CODES LABELS 
### 

core$NTMAJ12v1 <- core$NTMAJ12

core$NTMAJ12[ core$NTMAJ12 == "AR" ] <- "Arts"
core$NTMAJ12[ core$NTMAJ12 == "BH" ] <- "Universities"
core$NTMAJ12[ core$NTMAJ12 == "ED" ] <- "Education"
core$NTMAJ12[ core$NTMAJ12 == "EH" ] <- "Hospitals"
core$NTMAJ12[ core$NTMAJ12 == "EN" ] <- "Environmental"
core$NTMAJ12[ core$NTMAJ12 == "HE" ] <- "Health"
core$NTMAJ12[ core$NTMAJ12 == "HU" ] <- "Human Services"
core$NTMAJ12[ core$NTMAJ12 == "IN" ] <- "International"
core$NTMAJ12[ core$NTMAJ12 == "MU" ] <- "Mutual Benefit"
core$NTMAJ12[ core$NTMAJ12 == "PU" ] <- "Public Benefit"
core$NTMAJ12[ core$NTMAJ12 == "RE" ] <- "Religion"
core$NTMAJ12[ core$NTMAJ12 == "UN" ] <- "Unknown"

core$NTMAJ12 <- factor( core$NTMAJ12 )

# table( core$NTMAJ12 )

Filter by Study Sample

###
### STUDY SAMPLE
### 

orgs <- read.csv( "01-data-raw/org-sample.csv" )

# url.orgs <- "https://www.dropbox.com/s/fqk05x2ox3girq7/org-sample.csv?dl=1"
# orgs <- read.csv( url.orgs )

orgs <- dplyr::select( orgs, orgname, ein )

# convert to integer to avoid leading zeros problem 
orgs$ein <- as.numeric( orgs$ein )

# Check how many orgs from the study sample are in the core file?

intersect( orgs$ein, core$ein ) %>% length()
## [1] 5275
setdiff(   orgs$ein, core$ein ) %>% length()
## [1] 647

Use only sample orgs:

core <- dplyr::filter( core, ein %in% orgs$ein )

table( core$NTMAJ12 )
## 
##           Arts      Education  Environmental         Health      Hospitals 
##              0              0              0              0              0 
## Human Services  International Mutual Benefit Public Benefit       Religion 
##           3939              0              0           1585              0 
##   Universities        Unknown 
##              0              0
core <- filter( core, NTEE1 %in% c("L","P","S") )

core$NTEE1[ core$NTEE1 == "L" ] <- "Housing"
core$NTEE1[ core$NTEE1 == "P" ] <- "Human Services"
core$NTEE1[ core$NTEE1 == "S" ] <- "Community Development"

core$NTMAJ12 <- factor( core$NTMAJ12 )
# preview dataset
head(core) %>% pander()
Table continues below
ein tax_pd accntspayableend accntsrcvblend deprcatndepletn
10274725 201906 2350925 578480 454976
10275156 201808 261228 22988 3641
10277678 201809 2401004 0 981384
10281875 201809 345286 420585 39512
10287624 201812 93888 29916 94233
10315296 201812 1253115 62749 720787
Table continues below
grntspayableend invntriesalesend lndbldgsequipend nonintcashend
0 18103 6923356 7515
0 30635 4033 263795
0 0 2601953 8200
0 0 987970 407299
0 16070 1218705 557768
0 0 16108267 3407753
Table continues below
pldgegrntrcvblend prepaidexpnsend svngstempinvend totassetsend
1934929 108390 852294 12834072
504909 14473 0 1325430
2306244 162940 1629257 8321884
0 37371 1619882 9451435
0 0 344339 7993448
0 128312 4043269 52382631
Table continues below
totfuncexpns totliabend totnetassetend totprgmrevnue totrevenue
28907045 7508479 5325593 11036422 29420362
6940837 840264 485166 2692484 6945319
27732197 5582954 2738930 10226964 28303872
3944397 3493051 5958384 2542706 4165924
1846355 318852 7674596 178840 7375709
8869707 34889381 17493250 4852588 8378049
Table continues below
unrstrctnetasstsend totcntrbgfts netincfndrsng invstmntinc
4313496 18056930 -56024 18492
485166 4167243 0 0
2411937 15209801 -463 8786
-979507 1451557 0 0
7016677 7059907 -1122 2180
16501938 3361542 0 163919
Table continues below
txexmptbndsproceeds netrntlinc netgnls royaltsinc netincsales
0 0 364542 0 0
0 0 0 0 0
0 13650 2780040 0 0
0 171661 0 0 0
0 0 29497 0 84288
0 0 0 0 0
Table continues below
miscrevtot11e tax.form taxyr STATE NAME
0 990PC 2019 ME THE OPPORTUNITY ALLIANCE
85592 990PC 2018 ME WESTERN MAINE COMMUNITY ACTION INC
65094 990PC 2018 ME KENNEBEC VALLEY COMMUNITY ACTION PROGRAM
0 990PC 2018 ME EASTERN MAINE DEVELOPMENT CORPORATION
22119 990PC 2018 ME H O M E INC
0 990PC 2018 ME AVESTA HOUSING DEVELOPMENT CORPORATION
Table continues below
ADDRESS CITY FIPS RULEDATE NTEEFINAL NTEECC
50 LYDIA LN SO PORTLAND 23005 1965 P20 P20
PO BOX 200 EAST WILTON NA 1966 P20 P20
97 WATER ST WATERVILLE 23011 1965 P20 P20
40 HARLOW STREET BANGOR 23019 1994 S30 S30
PO BOX 10 ORLAND 23009 1971 P29 P29
307 CUMBERLAND AVE PORTLAND 23005 1976 L22 L22
Table continues below
NTEE1 NTMAJ5 NTMAJ10 NTMAJ12 LEVEL1 LEVEL2
Human Services HU HU Human Services PC O
Human Services HU HU Human Services PC O
Human Services HU HU Human Services PC O
Community Development OT PU Public Benefit PC O
Human Services HU HU Human Services PC O
Housing HU HU Human Services PC O
LEVEL3 LEVEL4 AGE State Region Division NTMAJ12v1
HS P 57 Maine Northeast New England HU
HS P 56 Maine Northeast New England HU
HS P 57 Maine Northeast New England HU
PB S 28 Maine Northeast New England PU
HS P 51 Maine Northeast New England HU
HS L 46 Maine Northeast New England HU

Save Wrangled Data

The output file has 5524 rows and 51 variables:

##  [1] "ein"                 "tax_pd"              "accntspayableend"   
##  [4] "accntsrcvblend"      "deprcatndepletn"     "grntspayableend"    
##  [7] "invntriesalesend"    "lndbldgsequipend"    "nonintcashend"      
## [10] "pldgegrntrcvblend"   "prepaidexpnsend"     "svngstempinvend"    
## [13] "totassetsend"        "totfuncexpns"        "totliabend"         
## [16] "totnetassetend"      "totprgmrevnue"       "totrevenue"         
## [19] "unrstrctnetasstsend" "totcntrbgfts"        "netincfndrsng"      
## [22] "invstmntinc"         "txexmptbndsproceeds" "netrntlinc"         
## [25] "netgnls"             "royaltsinc"          "netincsales"        
## [28] "miscrevtot11e"       "tax.form"            "taxyr"              
## [31] "STATE"               "NAME"                "ADDRESS"            
## [34] "CITY"                "FIPS"                "RULEDATE"           
## [37] "NTEEFINAL"           "NTEECC"              "NTEE1"              
## [40] "NTMAJ5"              "NTMAJ10"             "NTMAJ12"            
## [43] "LEVEL1"              "LEVEL2"              "LEVEL3"             
## [46] "LEVEL4"              "AGE"                 "State"              
## [49] "Region"              "Division"            "NTMAJ12v1"
saveRDS( core, "02-data-wrangled/core.rds" )