library( pander ) # translate output to HTML / latex
library( magrittr ) # use the pipe operator %>%
library( knitr ) # kable function formats tables
x1 <- 1:100
x2 <- -0.1*x1 + rnorm(100)
x3 <- 0.05*x2 + rnorm(100)
y <- 2*x1 + 10*rnorm(100) + 10*x2
dat <- data.frame( y, x1, x2, x3 )
head( dat )
## y x1 x2 x3
## 1 -14.78235 1 -0.9284646 -0.791091985
## 2 11.48104 2 0.9012347 0.009279205
## 3 19.81046 3 0.7431199 0.047636878
## 4 5.97359 4 -0.6265012 0.541796115
## 5 12.28457 5 -0.4747158 -0.686031076
## 6 14.07730 6 -0.3364044 -1.916720680
summary( dat ) %>% kable
y | x1 | x2 | x3 | |
---|---|---|---|---|
Min. :-14.78 | Min. : 1.00 | Min. :-11.9373 | Min. :-2.4884 | |
1st Qu.: 21.90 | 1st Qu.: 25.75 | 1st Qu.: -7.4236 | 1st Qu.:-0.8850 | |
Median : 53.81 | Median : 50.50 | Median : -5.0342 | Median :-0.1701 | |
Mean : 51.14 | Mean : 50.50 | Mean : -4.9769 | Mean :-0.1861 | |
3rd Qu.: 77.40 | 3rd Qu.: 75.25 | 3rd Qu.: -2.3300 | 3rd Qu.: 0.5908 | |
Max. :119.53 | Max. :100.00 | Max. : 0.9012 | Max. : 1.8252 |
library( pastecs ) # convenient descriptives function
stat.desc( dat ) %>% t %>% round(2) %>% pander
nbr.val | nbr.null | nbr.na | min | max | range | sum | |
---|---|---|---|---|---|---|---|
y | 100 | 0 | 0 | -14.78 | 119.5 | 134.3 | 5114 |
x1 | 100 | 0 | 0 | 1 | 100 | 99 | 5050 |
x2 | 100 | 0 | 0 | -11.94 | 0.9 | 12.84 | -497.7 |
x3 | 100 | 0 | 0 | -2.49 | 1.83 | 4.31 | -18.61 |
median | mean | SE.mean | CI.mean.0.95 | var | std.dev | coef.var | |
---|---|---|---|---|---|---|---|
y | 53.81 | 51.14 | 3.33 | 6.61 | 1108 | 33.29 | 0.65 |
x1 | 50.5 | 50.5 | 2.9 | 5.76 | 841.7 | 29.01 | 0.57 |
x2 | -5.03 | -4.98 | 0.3 | 0.6 | 9.1 | 3.02 | -0.61 |
x3 | -0.17 | -0.19 | 0.1 | 0.2 | 1.02 | 1.01 | -5.42 |
stat.desc( dat )[ c(1,4,5,8,9,13), ] %>% t %>% kable( format="markdown", digits=3 )
nbr.val | min | max | median | mean | std.dev | |
---|---|---|---|---|---|---|
y | 100 | -14.782 | 119.532 | 53.805 | 51.140 | 33.291 |
x1 | 100 | 1.000 | 100.000 | 50.500 | 50.500 | 29.011 |
x2 | 100 | -11.937 | 0.901 | -5.034 | -4.977 | 3.017 |
x3 | 100 | -2.488 | 1.825 | -0.170 | -0.186 | 1.008 |
stat.desc( dat )[ c(1,4,5,8,9,13), ] %>% t %>% pander( digits=3 )
nbr.val | min | max | median | mean | std.dev | |
---|---|---|---|---|---|---|
y | 100 | -14.8 | 120 | 53.8 | 51.1 | 33.3 |
x1 | 100 | 1 | 100 | 50.5 | 50.5 | 29 |
x2 | 100 | -11.9 | 0.901 | -5.03 | -4.98 | 3.02 |
x3 | 100 | -2.49 | 1.83 | -0.17 | -0.186 | 1.01 |
descriptives <- t( stat.desc(dat) )
write.table( descriptives, "clipboard", sep="\t", row.names=TRUE )
Convenient visual descriptives:
pairs( dat )
We can improve it:
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
test <- cor.test(x,y)
# borrowed from printCoefmat
Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " "))
text(0.5, 0.5, txt, cex = 1.5 )
text(.7, .8, Signif, cex=cex, col=2)
}
pairs( dat, lower.panel=panel.smooth, upper.panel=panel.cor)
x <- 1:100
y <- 2*x + rnorm(100,0,20)
plot( x, y )
dum <- sample( c("NJ","NY","MA","PA"), 100, replace=T )
The regression is run using the “linear model” command. The basic model will print the minimum output:
lm( y ~ x )
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 0.9454 1.9524
To generate nicely-formatted regression tables save the results from the regression as an object, and format the output for inclusion in a markdown document using the pander
package.
m.01 <- lm( y ~ x )
summary( m.01 ) %>% pander # add pander to format for markdown docs
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 0.9454 | 4.22 | 0.2241 | 0.8232 |
x | 1.952 | 0.07254 | 26.91 | 4.603e-47 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 20.94 | 0.8808 | 0.8796 |
par( mfrow=c(2,2) )
plot( m.01 )
coefficients( m.01 ) %>% pander # model coefficients
(Intercept) | x |
---|---|
0.9454 | 1.952 |
confint( m.01, level=0.95) %>% pander # CIs for model parameters
2.5 % | 97.5 % | |
---|---|---|
(Intercept) | -7.428 | 9.319 |
x | 1.808 | 2.096 |
anova( m.01 ) # anova table
fitted( m.01 ) # predicted values
residuals( m.01 ) # residuals
influence( m.01 ) # regression diagnostics
library( coefplot )
## Loading required package: ggplot2
m.02 <- lm( y ~ x1 + I(x1^2) + x2 + x3 )
coefplot(m.02)
library( memisc )
x_sqr <- x * x
m.01 <- lm( y ~ x )
m.02 <- lm( y ~ x + x_sqr ) # quadratic term
m.03 <- lm( y ~ x - 1 ) # no intercept term
pretty.table <- mtable("Model 1"=m.01,"Model 2"=m.02,"Model 3"=m.03,
summary.stats=c("R-squared","F","p","N"))
pretty.table %>% pander
Model 1 | Model 2 | Model 3 | |
---|---|---|---|
(Intercept) | 0.945 (4.220) |
5.953 (6.407) |
|
x | 1.952*** (0.073) |
1.658*** (0.293) |
1.967*** (0.036) |
x_sqr | 0.003 (0.003) |
||
R-squared | 0.9 | 0.9 | 1.0 |
F | 724.4 | 363.0 | 3013.1 |
p | 0.0 | 0.0 | 0.0 |
N | 100 | 100 | 100 |
summary( lm( y ~ x1 + x2 + x3 ) ) %>% pander
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 1.168 | 4.257 | 0.2743 | 0.7844 |
x1 | 1.778 | 0.2477 | 7.177 | 1.501e-10 |
x2 | -1.769 | 2.383 | -0.7424 | 0.4596 |
x3 | 1.096 | 2.103 | 0.5209 | 0.6036 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 21.07 | 0.8818 | 0.8781 |
# add different functional forms
# square x1
summary( lm( y ~ x1 + x1^2 + x2 + x3 ) ) %>% pander # incorrect
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 1.168 | 4.257 | 0.2743 | 0.7844 |
x1 | 1.778 | 0.2477 | 7.177 | 1.501e-10 |
x2 | -1.769 | 2.383 | -0.7424 | 0.4596 |
x3 | 1.096 | 2.103 | 0.5209 | 0.6036 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 21.07 | 0.8818 | 0.8781 |
summary( lm( y ~ x1 + I(x1^2) + x2 + x3 ) ) %>% pander # correct - enclose with I()
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 6.209 | 6.455 | 0.9618 | 0.3386 |
x1 | 1.469 | 0.3867 | 3.799 | 0.0002559 |
I(x1^2) | 0.002943 | 0.002834 | 1.038 | 0.3017 |
x2 | -1.88 | 2.384 | -0.7887 | 0.4322 |
x3 | 0.9677 | 2.106 | 0.4595 | 0.6469 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 21.06 | 0.8831 | 0.8782 |
summary( lm( y ~ log(x1) + x2 + x3 ) ) %>% pander # log of x1 in formula works fine
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | -18.41 | 13.53 | -1.361 | 0.1768 |
log(x1) | 12.13 | 5.434 | 2.232 | 0.02795 |
x2 | -14.89 | 1.672 | -8.91 | 3.267e-14 |
x3 | 1.564 | 2.541 | 0.6155 | 0.5397 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 25.47 | 0.8274 | 0.822 |
# interactions
summary( lm( y ~ x1 + x2 ) ) %>% pander
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 1.017 | 4.231 | 0.2404 | 0.8105 |
x1 | 1.781 | 0.2467 | 7.222 | 1.161e-10 |
x2 | -1.721 | 2.372 | -0.7254 | 0.4699 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 20.99 | 0.8815 | 0.879 |
summary( lm( y ~ x1 + x2 + I(x1*x2) ) ) %>% pander
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 3.251 | 6.078 | 0.535 | 0.5939 |
x1 | 1.714 | 0.2806 | 6.108 | 2.149e-08 |
x2 | -1.06 | 2.706 | -0.3917 | 0.6961 |
I(x1 * x2) | -0.0134 | 0.02608 | -0.5139 | 0.6085 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 21.07 | 0.8818 | 0.8781 |
summary( lm( y ~ x1*x2 ) ) %>% pander # shortcut to full interaction
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 3.251 | 6.078 | 0.535 | 0.5939 |
x1 | 1.714 | 0.2806 | 6.108 | 2.149e-08 |
x2 | -1.06 | 2.706 | -0.3917 | 0.6961 |
x1:x2 | -0.0134 | 0.02608 | -0.5139 | 0.6085 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 21.07 | 0.8818 | 0.8781 |
# dummy variables
summary( lm( y ~ x1 + x2 + x3 + dum ) ) %>% pander # drop one level
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | -0.08294 | 5.32 | -0.01559 | 0.9876 |
x1 | 1.774 | 0.2599 | 6.827 | 8.741e-10 |
x2 | -1.828 | 2.477 | -0.738 | 0.4624 |
x3 | 1.022 | 2.122 | 0.4815 | 0.6313 |
dumNJ | 3.655 | 5.692 | 0.6421 | 0.5224 |
dumNY | -2.463 | 5.832 | -0.4224 | 0.6737 |
dumPA | 3.882 | 6.442 | 0.6026 | 0.5482 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 21.24 | 0.8837 | 0.8762 |
summary( lm( y ~ x1 + x2 + x3 + dum - 1) ) %>% pander # keep all, drop intercept
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
x1 | 1.774 | 0.2599 | 6.827 | 8.741e-10 |
x2 | -1.828 | 2.477 | -0.738 | 0.4624 |
x3 | 1.022 | 2.122 | 0.4815 | 0.6313 |
dumMA | -0.08294 | 5.32 | -0.01559 | 0.9876 |
dumNJ | 3.572 | 5.323 | 0.671 | 0.5039 |
dumNY | -2.546 | 5.878 | -0.4331 | 0.6659 |
dumPA | 3.799 | 6.598 | 0.5758 | 0.5661 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 21.24 | 0.969 | 0.9666 |
library( lm.beta )
m.01.beta <- lm.beta( m.01 )
summary( m.01.beta ) # %>% pander
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.24 -11.31 1.63 11.20 76.18
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 0.94541 0.00000 4.21954 0.224 0.823
## x 1.95242 0.93853 0.07254 26.915 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.94 on 98 degrees of freedom
## Multiple R-squared: 0.8808, Adjusted R-squared: 0.8796
## F-statistic: 724.4 on 1 and 98 DF, p-value: < 2.2e-16
# coef( m.01.beta )
# note the standard error is not standardized - describes regular coefficients
summary( m.01 ) %>% pander
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 0.9454 | 4.22 | 0.2241 | 0.8232 |
x | 1.952 | 0.07254 | 26.91 | 4.603e-47 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 20.94 | 0.8808 | 0.8796 |
lm.beta <- function( my.mod )
{
b <- summary(my.mod)$coef[-1, 1]
sx <- sd( my.mod$model[,-1] )
sy <- sd( my.mod$model[,1] )
beta <- b * sx/sy
return(beta)
}
coefficients( m.01 ) %>% pander
(Intercept) | x |
---|---|
0.9454 | 1.952 |
lm.beta( m.01 ) %>% pander
0.9385
# install.packages( "sandwhich" )
# install.packages( "lmtest" )
library(sandwich)
library(lmtest)
m.01 <- lm( y ~ x )
# REGULAR STANDARD ERRORS - not robust
summary( m.01 ) %>% pander
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 0.9454 | 4.22 | 0.2241 | 0.8232 |
x | 1.952 | 0.07254 | 26.91 | 4.603e-47 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
100 | 20.94 | 0.8808 | 0.8796 |
# ROBUST STANDARD ERRORS
# reproduce the Stata default
coeftest( m.01, vcov=vcovHC(m.01,"HC1") ) # robust; HC1 (Stata default)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.945414 4.675088 0.2022 0.8402
## x 1.952417 0.079322 24.6139 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ROBUST STANDARD ERRORS
# check that "sandwich" returns HC0
coeftest(m.01, vcov = sandwich) # robust; sandwich
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.945414 4.628101 0.2043 0.8386
## x 1.952417 0.078525 24.8638 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(m.01, vcov = vcovHC(m.01, "HC0")) # robust; HC0
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.945414 4.628101 0.2043 0.8386
## x 1.952417 0.078525 24.8638 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ROBUST STANDARD ERRORS
# check that the default robust var-cov matrix is HC3
coeftest(m.01, vcov = vcovHC(m.01)) # robust; HC3
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.945414 4.770230 0.1982 0.8433
## x 1.952417 0.080959 24.1162 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(m.01, vcov = vcovHC(m.01, "HC3")) # robust; HC3 (default)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.945414 4.770230 0.1982 0.8433
## x 1.952417 0.080959 24.1162 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1