library( haven ) # importing data files
library( tidyr ) # data wrangling
library( dplyr ) # data wrangling
library( ggplot2 ) # fancy plots
library( ggthemes ) # fancy plots
library( scales ) # re-scaling numbers
library( stargazer ) # nice tables
library( pander ) # format tables for HTML
library( DT ) # embed datasets in HTML docs
source( "r-functions.R" )
###
### BUSINESS MASTER FILES
###
<- read.csv( "01-data-raw/nccs-bmf-2019.csv" )
bmf
# bmf.2019 <- "https://www.dropbox.com/s/bi9ii23bo8l521h/nccs-bmf-2019.csv?dl=1"
# bmf <- read.csv( bmf.2019 )
<- dplyr::select( bmf, EIN, NAME, ADDRESS, CITY, STATE, FIPS,
bmf
RULEDATE, NTEEFINAL, NTEECC,
NTEE1, NTMAJ5, NTMAJ10, NTMAJ12,
LEVEL1, LEVEL2, LEVEL3, LEVEL4 )
$RULEDATE <- substr( bmf$RULEDATE, 1, 4 ) %>% as.numeric()
bmf$RULEDATE[ bmf$RULEDATE < 1900 ] <- NA
bmf
$AGE <- 2022 - bmf$RULEDATE
bmf
# Coverage of 2019 BMF file is over 99%
#
# > nrow( core.2019.pc )
# [1] 304441
# > sum( core.2019.pc$ein %in% bmf$EIN )
# [1] 301855
###
### ADD CENSUS REGIONS
###
<- read.csv( "01-data-raw/us-census-bureau-regions-and-divisions.csv" )
census
# url.census <- "https://raw.githubusercontent.com/lecy/fiscal-health/main/01-data-raw/us-census-bureau-regions-and-divisions.csv"
# census <- read.csv( url.census )
<- merge( bmf, census, by.x="STATE", by.y="State.Code", all.x=TRUE )
bmf
###
### CORE PC
###
# COMMON VARIABLES FOR DESCRIPTIVES
#
# totrevenue - total revenue
# totfuncexpns - total functional expenses
# lndbldgsequipend - land, buildings, equipment net assets
# totnetassetend - net assets
# VARIABLES USED FOR METRICS
#
# totassetsend - total assets end of year
# totliabend - total liabilities end of year
2019.pc <- read.csv( "01-data-raw/irs-990-pc-soi-extract-2019.csv" )
core.
# url.soi.pc <- "https://www.dropbox.com/s/88y9sup4d3mwoo5/irs-990-pc-soi-extract-2019.csv?dl=1"
# core.2019.pc <- read.csv( url.soi.pc )
2019.pc <-
core.2019.pc %>%
core.::select( ein,
dplyr
tax_pd,
accntspayableend,
accntsrcvblend,
deprcatndepletn,
grntspayableend,
invntriesalesend,
lndbldgsequipend,
nonintcashend,
pldgegrntrcvblend,
prepaidexpnsend,
svngstempinvend,
totassetsend,
totfuncexpns,
totliabend,
totnetassetend,
totprgmrevnue,
totrevenue,
unrstrctnetasstsend,
totcntrbgfts,
netincfndrsng,
invstmntinc,
txexmptbndsproceeds,
netrntlinc,
netgnls,
royaltsinc,
netincsales,
miscrevtot11e)
2019.pc$tax.form <- "990PC"
core.
# F9_01_EXP_REV_LESS_EXP_CY
# F9_01_EXP_TOT_CY
# F9_01_REV_TOT_CY
# F9_08_REV_CONTR_GOVT_GRANT
# F9_08_REV_TOT_TOT
# F9_09_EXP_DEPREC_TOT
# F9_09_EXP_INT_TOT
# F9_09_EXP_TOT_MGMT
# F9_09_EXP_TOT_PROG
# F9_09_EXP_TOT_TOT
# F9_09_EXP_TOT_TOT-F9_09_EXP_DEPREC_TOT
# F9_10_ASSET_INV_SALE_EOY
# F9_10_ASSET_LAND_BLDG_NET_EOY
# F9_10_LIAB_MTG_NOTE_EOY
# F9_10_NAFB_UNRESTRICT_EOY
# F9_10_NAFB_UNRESTRICT_BOY (edited)
###
### CORE EZ
###
# COMMON VARIABLES FOR DESCRIPTIVES
#
# totrevnue - total revenue
# totexpns - total functional expenses
# NA - land, buildings, equipment net assets
# networthend - net assets
# VARIABLES USED FOR METRICS
#
# totassetsend - total assets end of year
# totliabend - total liabilities end of year
2019.ez <- read.csv( "01-data-raw/irs-990-ez-soi-extract-2019.csv" )
core.
# url.soi.ez <- "https://www.dropbox.com/s/2amoe8woboyxpza/irs-990-ez-soi-extract-2019.csv?dl=1"
# core.2019.ez <- read.csv( url.soi.ez )
2019.ez <- dplyr::select( core.2019.ez, EIN, tax_pd,
core.
totrevnue, totexpns, networthend,
totassetsend, totliabend,
prgmservrev, totcntrbs)
2019.ez <- dplyr::rename( core.2019.ez,
core.ein = EIN,
totrevenue = totrevnue,
totprgmrevnue = prgmservrev,
totfuncexpns = totexpns,
totnetassetend = networthend,
totassetsend = totassetsend,
totliabend = totliabend,
totcntrbgfts = totcntrbs )
2019.ez$tax.form <- "990EZ"
core.
###
### STACK PC AND EZ FILES
###
<- dplyr::bind_rows( core.2019.pc, core.2019.ez )
core
$taxyr <- substr( core$tax_pd, 1, 4 )
core
###
### ADD BMF META-DATA GEO, NTEE, AND AGE FIELDS
###
# check for leading zeros problem
# class( core.2019.pc$ein )
# [1] "integer"
# class( bmf$EIN )
# [1] "integer"
<- merge( core, bmf, by.x="ein", by.y="EIN", all.x=TRUE )
core
###
### ADD NTEE CODES LABELS
###
$NTMAJ12v1 <- core$NTMAJ12
core
$NTMAJ12[ core$NTMAJ12 == "AR" ] <- "Arts"
core$NTMAJ12[ core$NTMAJ12 == "BH" ] <- "Universities"
core$NTMAJ12[ core$NTMAJ12 == "ED" ] <- "Education"
core$NTMAJ12[ core$NTMAJ12 == "EH" ] <- "Hospitals"
core$NTMAJ12[ core$NTMAJ12 == "EN" ] <- "Environmental"
core$NTMAJ12[ core$NTMAJ12 == "HE" ] <- "Health"
core$NTMAJ12[ core$NTMAJ12 == "HU" ] <- "Human Services"
core$NTMAJ12[ core$NTMAJ12 == "IN" ] <- "International"
core$NTMAJ12[ core$NTMAJ12 == "MU" ] <- "Mutual Benefit"
core$NTMAJ12[ core$NTMAJ12 == "PU" ] <- "Public Benefit"
core$NTMAJ12[ core$NTMAJ12 == "RE" ] <- "Religion"
core$NTMAJ12[ core$NTMAJ12 == "UN" ] <- "Unknown"
core
$NTMAJ12 <- factor( core$NTMAJ12 )
core
# table( core$NTMAJ12 )
###
### STUDY SAMPLE
###
<- read.csv( "01-data-raw/org-sample.csv" )
orgs
# url.orgs <- "https://www.dropbox.com/s/fqk05x2ox3girq7/org-sample.csv?dl=1"
# orgs <- read.csv( url.orgs )
<- dplyr::select( orgs, orgname, ein )
orgs
# convert to integer to avoid leading zeros problem
$ein <- as.numeric( orgs$ein )
orgs
# Check how many orgs from the study sample are in the core file?
intersect( orgs$ein, core$ein ) %>% length()
## [1] 5275
setdiff( orgs$ein, core$ein ) %>% length()
## [1] 647
Use only sample orgs:
<- dplyr::filter( core, ein %in% orgs$ein )
core
table( core$NTMAJ12 )
##
## Arts Education Environmental Health Hospitals
## 0 0 0 0 0
## Human Services International Mutual Benefit Public Benefit Religion
## 3939 0 0 1585 0
## Universities Unknown
## 0 0
<- filter( core, NTEE1 %in% c("L","P","S") )
core
$NTEE1[ core$NTEE1 == "L" ] <- "Housing"
core$NTEE1[ core$NTEE1 == "P" ] <- "Human Services"
core$NTEE1[ core$NTEE1 == "S" ] <- "Community Development"
core
$NTMAJ12 <- factor( core$NTMAJ12 ) core
# preview dataset
head(core) %>% pander()
ein | tax_pd | accntspayableend | accntsrcvblend | deprcatndepletn |
---|---|---|---|---|
10274725 | 201906 | 2350925 | 578480 | 454976 |
10275156 | 201808 | 261228 | 22988 | 3641 |
10277678 | 201809 | 2401004 | 0 | 981384 |
10281875 | 201809 | 345286 | 420585 | 39512 |
10287624 | 201812 | 93888 | 29916 | 94233 |
10315296 | 201812 | 1253115 | 62749 | 720787 |
grntspayableend | invntriesalesend | lndbldgsequipend | nonintcashend |
---|---|---|---|
0 | 18103 | 6923356 | 7515 |
0 | 30635 | 4033 | 263795 |
0 | 0 | 2601953 | 8200 |
0 | 0 | 987970 | 407299 |
0 | 16070 | 1218705 | 557768 |
0 | 0 | 16108267 | 3407753 |
pldgegrntrcvblend | prepaidexpnsend | svngstempinvend | totassetsend |
---|---|---|---|
1934929 | 108390 | 852294 | 12834072 |
504909 | 14473 | 0 | 1325430 |
2306244 | 162940 | 1629257 | 8321884 |
0 | 37371 | 1619882 | 9451435 |
0 | 0 | 344339 | 7993448 |
0 | 128312 | 4043269 | 52382631 |
totfuncexpns | totliabend | totnetassetend | totprgmrevnue | totrevenue |
---|---|---|---|---|
28907045 | 7508479 | 5325593 | 11036422 | 29420362 |
6940837 | 840264 | 485166 | 2692484 | 6945319 |
27732197 | 5582954 | 2738930 | 10226964 | 28303872 |
3944397 | 3493051 | 5958384 | 2542706 | 4165924 |
1846355 | 318852 | 7674596 | 178840 | 7375709 |
8869707 | 34889381 | 17493250 | 4852588 | 8378049 |
unrstrctnetasstsend | totcntrbgfts | netincfndrsng | invstmntinc |
---|---|---|---|
4313496 | 18056930 | -56024 | 18492 |
485166 | 4167243 | 0 | 0 |
2411937 | 15209801 | -463 | 8786 |
-979507 | 1451557 | 0 | 0 |
7016677 | 7059907 | -1122 | 2180 |
16501938 | 3361542 | 0 | 163919 |
txexmptbndsproceeds | netrntlinc | netgnls | royaltsinc | netincsales |
---|---|---|---|---|
0 | 0 | 364542 | 0 | 0 |
0 | 0 | 0 | 0 | 0 |
0 | 13650 | 2780040 | 0 | 0 |
0 | 171661 | 0 | 0 | 0 |
0 | 0 | 29497 | 0 | 84288 |
0 | 0 | 0 | 0 | 0 |
miscrevtot11e | tax.form | taxyr | STATE | NAME |
---|---|---|---|---|
0 | 990PC | 2019 | ME | THE OPPORTUNITY ALLIANCE |
85592 | 990PC | 2018 | ME | WESTERN MAINE COMMUNITY ACTION INC |
65094 | 990PC | 2018 | ME | KENNEBEC VALLEY COMMUNITY ACTION PROGRAM |
0 | 990PC | 2018 | ME | EASTERN MAINE DEVELOPMENT CORPORATION |
22119 | 990PC | 2018 | ME | H O M E INC |
0 | 990PC | 2018 | ME | AVESTA HOUSING DEVELOPMENT CORPORATION |
ADDRESS | CITY | FIPS | RULEDATE | NTEEFINAL | NTEECC |
---|---|---|---|---|---|
50 LYDIA LN | SO PORTLAND | 23005 | 1965 | P20 | P20 |
PO BOX 200 | EAST WILTON | NA | 1966 | P20 | P20 |
97 WATER ST | WATERVILLE | 23011 | 1965 | P20 | P20 |
40 HARLOW STREET | BANGOR | 23019 | 1994 | S30 | S30 |
PO BOX 10 | ORLAND | 23009 | 1971 | P29 | P29 |
307 CUMBERLAND AVE | PORTLAND | 23005 | 1976 | L22 | L22 |
NTEE1 | NTMAJ5 | NTMAJ10 | NTMAJ12 | LEVEL1 | LEVEL2 |
---|---|---|---|---|---|
Human Services | HU | HU | Human Services | PC | O |
Human Services | HU | HU | Human Services | PC | O |
Human Services | HU | HU | Human Services | PC | O |
Community Development | OT | PU | Public Benefit | PC | O |
Human Services | HU | HU | Human Services | PC | O |
Housing | HU | HU | Human Services | PC | O |
LEVEL3 | LEVEL4 | AGE | State | Region | Division | NTMAJ12v1 |
---|---|---|---|---|---|---|
HS | P | 57 | Maine | Northeast | New England | HU |
HS | P | 56 | Maine | Northeast | New England | HU |
HS | P | 57 | Maine | Northeast | New England | HU |
PB | S | 28 | Maine | Northeast | New England | PU |
HS | P | 51 | Maine | Northeast | New England | HU |
HS | L | 46 | Maine | Northeast | New England | HU |
The output file has 5524 rows and 51 variables:
## [1] "ein" "tax_pd" "accntspayableend"
## [4] "accntsrcvblend" "deprcatndepletn" "grntspayableend"
## [7] "invntriesalesend" "lndbldgsequipend" "nonintcashend"
## [10] "pldgegrntrcvblend" "prepaidexpnsend" "svngstempinvend"
## [13] "totassetsend" "totfuncexpns" "totliabend"
## [16] "totnetassetend" "totprgmrevnue" "totrevenue"
## [19] "unrstrctnetasstsend" "totcntrbgfts" "netincfndrsng"
## [22] "invstmntinc" "txexmptbndsproceeds" "netrntlinc"
## [25] "netgnls" "royaltsinc" "netincsales"
## [28] "miscrevtot11e" "tax.form" "taxyr"
## [31] "STATE" "NAME" "ADDRESS"
## [34] "CITY" "FIPS" "RULEDATE"
## [37] "NTEEFINAL" "NTEECC" "NTEE1"
## [40] "NTMAJ5" "NTMAJ10" "NTMAJ12"
## [43] "LEVEL1" "LEVEL2" "LEVEL3"
## [46] "LEVEL4" "AGE" "State"
## [49] "Region" "Division" "NTMAJ12v1"
saveRDS( core, "02-data-wrangled/core.rds" )