SETUP

load useful packages for formatting output

library( pander ) # translate output to HTML / latex

library(  magrittr )  # use the pipe operator %>%

library( knitr )  # kable function formats tables

create a toy dataset

x1 <- 1:100

x2 <- -0.1*x1 + rnorm(100)

x3 <- 0.05*x2 + rnorm(100)

y <- 2*x1 + 10*rnorm(100) + 10*x2

dat <- data.frame( y, x1, x2, x3 )

head( dat )
##           y x1         x2           x3
## 1 -14.78235  1 -0.9284646 -0.791091985
## 2  11.48104  2  0.9012347  0.009279205
## 3  19.81046  3  0.7431199  0.047636878
## 4   5.97359  4 -0.6265012  0.541796115
## 5  12.28457  5 -0.4747158 -0.686031076
## 6  14.07730  6 -0.3364044 -1.916720680

DESCRIPTIVE STATISTICS

summary function

summary( dat ) %>% kable
y x1 x2 x3
Min. :-14.78 Min. : 1.00 Min. :-11.9373 Min. :-2.4884
1st Qu.: 21.90 1st Qu.: 25.75 1st Qu.: -7.4236 1st Qu.:-0.8850
Median : 53.81 Median : 50.50 Median : -5.0342 Median :-0.1701
Mean : 51.14 Mean : 50.50 Mean : -4.9769 Mean :-0.1861
3rd Qu.: 77.40 3rd Qu.: 75.25 3rd Qu.: -2.3300 3rd Qu.: 0.5908
Max. :119.53 Max. :100.00 Max. : 0.9012 Max. : 1.8252

all descriptive stats

library( pastecs ) # convenient descriptives function

stat.desc( dat ) %>% t %>% round(2) %>% pander
Table continues below
nbr.val nbr.null nbr.na min max range sum
y 100 0 0 -14.78 119.5 134.3 5114
x1 100 0 0 1 100 99 5050
x2 100 0 0 -11.94 0.9 12.84 -497.7
x3 100 0 0 -2.49 1.83 4.31 -18.61
median mean SE.mean CI.mean.0.95 var std.dev coef.var
y 53.81 51.14 3.33 6.61 1108 33.29 0.65
x1 50.5 50.5 2.9 5.76 841.7 29.01 0.57
x2 -5.03 -4.98 0.3 0.6 9.1 3.02 -0.61
x3 -0.17 -0.19 0.1 0.2 1.02 1.01 -5.42

select descriptive stats

stat.desc( dat )[ c(1,4,5,8,9,13), ] %>% t %>% kable( format="markdown", digits=3 )
nbr.val min max median mean std.dev
y 100 -14.782 119.532 53.805 51.140 33.291
x1 100 1.000 100.000 50.500 50.500 29.011
x2 100 -11.937 0.901 -5.034 -4.977 3.017
x3 100 -2.488 1.825 -0.170 -0.186 1.008
stat.desc( dat )[ c(1,4,5,8,9,13), ] %>% t %>% pander( digits=3 )
nbr.val min max median mean std.dev
y 100 -14.8 120 53.8 51.1 33.3
x1 100 1 100 50.5 50.5 29
x2 100 -11.9 0.901 -5.03 -4.98 3.02
x3 100 -2.49 1.83 -0.17 -0.186 1.01

copy and paste a table to excel


descriptives <- t( stat.desc(dat) )
 
write.table( descriptives, "clipboard", sep="\t", row.names=TRUE )

SCATTERPLOTS

pairs plot

Convenient visual descriptives:

pairs( dat )

pretty pairs plot

We can improve it:

panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits=digits)[1]
    txt <- paste(prefix, txt, sep="")
    if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
    
    test <- cor.test(x,y)
    # borrowed from printCoefmat
    Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                  symbols = c("***", "**", "*", ".", " "))
    
    text(0.5, 0.5, txt, cex = 1.5 )
    text(.7, .8, Signif, cex=cex, col=2)
}


pairs( dat, lower.panel=panel.smooth, upper.panel=panel.cor)

REGRESSION SYNTAX

create some data

x <- 1:100
y <- 2*x + rnorm(100,0,20)

plot( x, y )

dum <- sample( c("NJ","NY","MA","PA"), 100, replace=T )

basic regression syntax

The regression is run using the “linear model” command. The basic model will print the minimum output:

lm( y ~ x )
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      0.9454       1.9524

To generate nicely-formatted regression tables save the results from the regression as an object, and format the output for inclusion in a markdown document using the pander package.

m.01 <- lm( y ~ x )

summary( m.01 ) %>% pander  # add pander to format for markdown docs
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9454 4.22 0.2241 0.8232
x 1.952 0.07254 26.91 4.603e-47
Fitting linear model: y ~ x
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 20.94 0.8808 0.8796

nice visual diagnostics of model fit

par( mfrow=c(2,2) )
plot( m.01 )

useful model fit functions

coefficients( m.01 ) %>% pander # model coefficients
(Intercept) x
0.9454 1.952
confint( m.01, level=0.95) %>% pander # CIs for model parameters 
2.5 % 97.5 %
(Intercept) -7.428 9.319
x 1.808 2.096

anova( m.01 )        # anova table 
fitted( m.01 )       # predicted values
residuals( m.01 )    # residuals
influence( m.01 )    # regression diagnostics

coefficient plots

library( coefplot )
## Loading required package: ggplot2
m.02 <-  lm( y ~ x1 + I(x1^2) + x2 + x3 )

coefplot(m.02)

table with multiple regression models

library( memisc )

x_sqr <- x * x

m.01 <- lm( y ~ x )
m.02 <- lm( y ~ x + x_sqr )  # quadratic term
m.03 <- lm( y ~ x - 1 )       # no intercept term


pretty.table <- mtable("Model 1"=m.01,"Model 2"=m.02,"Model 3"=m.03,
                  summary.stats=c("R-squared","F","p","N"))

pretty.table %>% pander
Model 1 Model 2 Model 3
(Intercept) 0.945
(4.220)
5.953
(6.407)

x 1.952***
(0.073)
1.658***
(0.293)
1.967***
(0.036)
x_sqr
0.003
(0.003)

R-squared 0.9 0.9 1.0
F 724.4 363.0 3013.1
p 0.0 0.0 0.0
N 100 100 100

SPECIFICATION

variable tansformations

summary( lm( y ~ x1 + x2 + x3 ) ) %>% pander
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.168 4.257 0.2743 0.7844
x1 1.778 0.2477 7.177 1.501e-10
x2 -1.769 2.383 -0.7424 0.4596
x3 1.096 2.103 0.5209 0.6036
Fitting linear model: y ~ x1 + x2 + x3
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 21.07 0.8818 0.8781
# add different functional forms

# square x1

summary( lm( y ~ x1 + x1^2 + x2 + x3 ) )  %>% pander      # incorrect
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.168 4.257 0.2743 0.7844
x1 1.778 0.2477 7.177 1.501e-10
x2 -1.769 2.383 -0.7424 0.4596
x3 1.096 2.103 0.5209 0.6036
Fitting linear model: y ~ x1 + x1^2 + x2 + x3
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 21.07 0.8818 0.8781
summary( lm( y ~ x1 + I(x1^2) + x2 + x3 ) )  %>% pander   # correct - enclose with I()
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.209 6.455 0.9618 0.3386
x1 1.469 0.3867 3.799 0.0002559
I(x1^2) 0.002943 0.002834 1.038 0.3017
x2 -1.88 2.384 -0.7887 0.4322
x3 0.9677 2.106 0.4595 0.6469
Fitting linear model: y ~ x1 + I(x1^2) + x2 + x3
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 21.06 0.8831 0.8782
summary( lm( y ~ log(x1) + x2 + x3 ) )   %>% pander       # log of x1 in formula works fine
Estimate Std. Error t value Pr(>|t|)
(Intercept) -18.41 13.53 -1.361 0.1768
log(x1) 12.13 5.434 2.232 0.02795
x2 -14.89 1.672 -8.91 3.267e-14
x3 1.564 2.541 0.6155 0.5397
Fitting linear model: y ~ log(x1) + x2 + x3
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 25.47 0.8274 0.822

interactions

# interactions

summary( lm( y ~ x1 + x2 ) ) %>% pander
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.017 4.231 0.2404 0.8105
x1 1.781 0.2467 7.222 1.161e-10
x2 -1.721 2.372 -0.7254 0.4699
Fitting linear model: y ~ x1 + x2
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 20.99 0.8815 0.879
summary( lm( y ~ x1 + x2 + I(x1*x2) ) )  %>% pander
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.251 6.078 0.535 0.5939
x1 1.714 0.2806 6.108 2.149e-08
x2 -1.06 2.706 -0.3917 0.6961
I(x1 * x2) -0.0134 0.02608 -0.5139 0.6085
Fitting linear model: y ~ x1 + x2 + I(x1 * x2)
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 21.07 0.8818 0.8781
summary( lm( y ~ x1*x2 ) ) %>% pander    # shortcut to full interaction
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.251 6.078 0.535 0.5939
x1 1.714 0.2806 6.108 2.149e-08
x2 -1.06 2.706 -0.3917 0.6961
x1:x2 -0.0134 0.02608 -0.5139 0.6085
Fitting linear model: y ~ x1 * x2
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 21.07 0.8818 0.8781

dummy variables

# dummy variables

summary( lm( y ~ x1 + x2 + x3 + dum ) ) %>% pander     # drop one level
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.08294 5.32 -0.01559 0.9876
x1 1.774 0.2599 6.827 8.741e-10
x2 -1.828 2.477 -0.738 0.4624
x3 1.022 2.122 0.4815 0.6313
dumNJ 3.655 5.692 0.6421 0.5224
dumNY -2.463 5.832 -0.4224 0.6737
dumPA 3.882 6.442 0.6026 0.5482
Fitting linear model: y ~ x1 + x2 + x3 + dum
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 21.24 0.8837 0.8762
summary( lm( y ~ x1 + x2 + x3 + dum - 1) ) %>% pander  # keep all, drop intercept
Estimate Std. Error t value Pr(>|t|)
x1 1.774 0.2599 6.827 8.741e-10
x2 -1.828 2.477 -0.738 0.4624
x3 1.022 2.122 0.4815 0.6313
dumMA -0.08294 5.32 -0.01559 0.9876
dumNJ 3.572 5.323 0.671 0.5039
dumNY -2.546 5.878 -0.4331 0.6659
dumPA 3.799 6.598 0.5758 0.5661
Fitting linear model: y ~ x1 + x2 + x3 + dum - 1
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 21.24 0.969 0.9666

ADVANCED

standardized regression coefficients (beta)

library( lm.beta )

m.01.beta <- lm.beta( m.01 )

summary( m.01.beta ) # %>% pander
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -54.24 -11.31   1.63  11.20  76.18 
## 
## Coefficients:
##             Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  0.94541      0.00000    4.21954   0.224    0.823    
## x            1.95242      0.93853    0.07254  26.915   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.94 on 98 degrees of freedom
## Multiple R-squared:  0.8808, Adjusted R-squared:  0.8796 
## F-statistic: 724.4 on 1 and 98 DF,  p-value: < 2.2e-16
# coef( m.01.beta )


# note the standard error is not standardized - describes regular coefficients

summary( m.01 ) %>% pander
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9454 4.22 0.2241 0.8232
x 1.952 0.07254 26.91 4.603e-47
Fitting linear model: y ~ x
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 20.94 0.8808 0.8796

or just use the formula:

lm.beta <- function( my.mod ) 
{
    b <- summary(my.mod)$coef[-1, 1]
    sx <- sd( my.mod$model[,-1] )
    sy <- sd( my.mod$model[,1] )
    beta <- b * sx/sy
    return(beta)
}


coefficients( m.01 ) %>% pander
(Intercept) x
0.9454 1.952
lm.beta( m.01 ) %>% pander

0.9385

robust standard errors

# install.packages( "sandwhich" )
# install.packages( "lmtest" )

library(sandwich)
library(lmtest)

m.01 <- lm( y ~ x )


# REGULAR STANDARD ERRORS - not robust

summary( m.01 ) %>% pander
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9454 4.22 0.2241 0.8232
x 1.952 0.07254 26.91 4.603e-47
Fitting linear model: y ~ x
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
100 20.94 0.8808 0.8796


# ROBUST STANDARD ERRORS

# reproduce the Stata default
coeftest( m.01, vcov=vcovHC(m.01,"HC1") )    # robust; HC1 (Stata default)
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.675088  0.2022   0.8402    
## x           1.952417   0.079322 24.6139   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


# ROBUST STANDARD ERRORS

# check that "sandwich" returns HC0
coeftest(m.01, vcov = sandwich)                # robust; sandwich
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.628101  0.2043   0.8386    
## x           1.952417   0.078525 24.8638   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(m.01, vcov = vcovHC(m.01, "HC0"))     # robust; HC0 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.628101  0.2043   0.8386    
## x           1.952417   0.078525 24.8638   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


# ROBUST STANDARD ERRORS

# check that the default robust var-cov matrix is HC3
coeftest(m.01, vcov = vcovHC(m.01))            # robust; HC3 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.770230  0.1982   0.8433    
## x           1.952417   0.080959 24.1162   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(m.01, vcov = vcovHC(m.01, "HC3"))     # robust; HC3 (default)
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.770230  0.1982   0.8433    
## x           1.952417   0.080959 24.1162   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1