SETUP

load useful packages for formatting output

library( pander ) # translate output to HTML / latex

library(  magrittr )  # use the pipe operator %>%

library( knitr )  # kable function formats tables

create a toy dataset

x1 <- 1:100

x2 <- -0.1*x1 + rnorm(100)

x3 <- 0.05*x2 + rnorm(100)

y <- 2*x1 + 10*rnorm(100) + 10*x2

dat <- data.frame( y, x1, x2, x3 )

head( dat )

##           y x1         x2           x3
## 1 -14.78235  1 -0.9284646 -0.791091985
## 2  11.48104  2  0.9012347  0.009279205
## 3  19.81046  3  0.7431199  0.047636878
## 4   5.97359  4 -0.6265012  0.541796115
## 5  12.28457  5 -0.4747158 -0.686031076
## 6  14.07730  6 -0.3364044 -1.916720680

DESCRIPTIVE STATISTICS

summary function

summary( dat ) %>% kable

y	x1	x2	x3
Min. :-14.78	Min. : 1.00	Min. :-11.9373	Min. :-2.4884
1st Qu.: 21.90	1st Qu.: 25.75	1st Qu.: -7.4236	1st Qu.:-0.8850
Median : 53.81	Median : 50.50	Median : -5.0342	Median :-0.1701
Mean : 51.14	Mean : 50.50	Mean : -4.9769	Mean :-0.1861
3rd Qu.: 77.40	3rd Qu.: 75.25	3rd Qu.: -2.3300	3rd Qu.: 0.5908
Max. :119.53	Max. :100.00	Max. : 0.9012	Max. : 1.8252

all descriptive stats

library( pastecs ) # convenient descriptives function

stat.desc( dat ) %>% t %>% round(2) %>% pander

Table continues below
	nbr.val	min	max	range	sum
y	100	-14.78	119.5	134.3	5114
x1	100	1	100	99	5050
x2	100	-11.94	0.9	12.84	-497.7
x3	100	-2.49	1.83	4.31	-18.61

	median	mean	SE.mean	CI.mean.0.95	var	std.dev	coef.var
y	53.81	51.14	3.33	6.61	1108	33.29	0.65
x1	50.5	50.5	2.9	5.76	841.7	29.01	0.57
x2	-5.03	-4.98	0.3	0.6	9.1	3.02	-0.61
x3	-0.17	-0.19	0.1	0.2	1.02	1.01	-5.42

select descriptive stats

stat.desc( dat )[ c(1,4,5,8,9,13), ] %>% t %>% kable( format="markdown", digits=3 )

	nbr.val	min	max	median	mean	std.dev
y	100	-14.782	119.532	53.805	51.140	33.291
x1	100	1.000	100.000	50.500	50.500	29.011
x2	100	-11.937	0.901	-5.034	-4.977	3.017
x3	100	-2.488	1.825	-0.170	-0.186	1.008

stat.desc( dat )[ c(1,4,5,8,9,13), ] %>% t %>% pander( digits=3 )

	nbr.val	min	max	median	mean	std.dev
y	100	-14.8	120	53.8	51.1	33.3
x1	100	1	100	50.5	50.5	29
x2	100	-11.9	0.901	-5.03	-4.98	3.02
x3	100	-2.49	1.83	-0.17	-0.186	1.01

copy and paste a table to excel


descriptives <- t( stat.desc(dat) )
 
write.table( descriptives, "clipboard", sep="\t", row.names=TRUE )

SCATTERPLOTS

pairs plot

Convenient visual descriptives:

pairs( dat )

pretty pairs plot

We can improve it:

panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits=digits)[1]
    txt <- paste(prefix, txt, sep="")
    if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
    
    test <- cor.test(x,y)
    # borrowed from printCoefmat
    Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                  symbols = c("***", "**", "*", ".", " "))
    
    text(0.5, 0.5, txt, cex = 1.5 )
    text(.7, .8, Signif, cex=cex, col=2)
}


pairs( dat, lower.panel=panel.smooth, upper.panel=panel.cor)

REGRESSION SYNTAX

create some data

x <- 1:100
y <- 2*x + rnorm(100,0,20)

plot( x, y )

dum <- sample( c("NJ","NY","MA","PA"), 100, replace=T )

basic regression syntax

The regression is run using the “linear model” command. The basic model will print the minimum output:

lm( y ~ x )

## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      0.9454       1.9524

To generate nicely-formatted regression tables save the results from the regression as an object, and format the output for inclusion in a markdown document using the pander package.

m.01 <- lm( y ~ x )

summary( m.01 ) %>% pander  # add pander to format for markdown docs

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	0.9454	4.22	0.2241	0.8232
x	1.952	0.07254	26.91	4.603e-47

Fitting linear model: y ~ x
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	20.94	0.8808	0.8796

nice visual diagnostics of model fit

par( mfrow=c(2,2) )
plot( m.01 )

useful model fit functions

coefficients( m.01 ) %>% pander # model coefficients

(Intercept)	x
0.9454	1.952

confint( m.01, level=0.95) %>% pander # CIs for model parameters

	2.5 %	97.5 %
(Intercept)	-7.428	9.319
x	1.808	2.096

anova( m.01 )        # anova table 
fitted( m.01 )       # predicted values
residuals( m.01 )    # residuals
influence( m.01 )    # regression diagnostics

coefficient plots

library( coefplot )

## Loading required package: ggplot2

m.02 <-  lm( y ~ x1 + I(x1^2) + x2 + x3 )

coefplot(m.02)

table with multiple regression models

library( memisc )

x_sqr <- x * x

m.01 <- lm( y ~ x )
m.02 <- lm( y ~ x + x_sqr )  # quadratic term
m.03 <- lm( y ~ x - 1 )       # no intercept term


pretty.table <- mtable("Model 1"=m.01,"Model 2"=m.02,"Model 3"=m.03,
                  summary.stats=c("R-squared","F","p","N"))

pretty.table %>% pander

	Model 1	Model 2	Model 3
(Intercept)	0.945 (4.220)	5.953 (6.407)
x	1.952*** (0.073)	1.658*** (0.293)	1.967*** (0.036)
x_sqr		0.003 (0.003)
R-squared	0.9	0.9	1.0
F	724.4	363.0	3013.1
p	0.0	0.0	0.0
N	100	100	100

SPECIFICATION

variable tansformations

summary( lm( y ~ x1 + x2 + x3 ) ) %>% pander

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	1.168	4.257	0.2743	0.7844
x1	1.778	0.2477	7.177	1.501e-10
x2	-1.769	2.383	-0.7424	0.4596
x3	1.096	2.103	0.5209	0.6036

Fitting linear model: y ~ x1 + x2 + x3
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	21.07	0.8818	0.8781

# add different functional forms

# square x1

summary( lm( y ~ x1 + x1^2 + x2 + x3 ) )  %>% pander      # incorrect

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	1.168	4.257	0.2743	0.7844
x1	1.778	0.2477	7.177	1.501e-10
x2	-1.769	2.383	-0.7424	0.4596
x3	1.096	2.103	0.5209	0.6036

Fitting linear model: y ~ x1 + x1^2 + x2 + x3
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	21.07	0.8818	0.8781

summary( lm( y ~ x1 + I(x1^2) + x2 + x3 ) )  %>% pander   # correct - enclose with I()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	6.209	6.455	0.9618	0.3386
x1	1.469	0.3867	3.799	0.0002559
I(x1^2)	0.002943	0.002834	1.038	0.3017
x2	-1.88	2.384	-0.7887	0.4322
x3	0.9677	2.106	0.4595	0.6469

Fitting linear model: y ~ x1 + I(x1^2) + x2 + x3
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	21.06	0.8831	0.8782

summary( lm( y ~ log(x1) + x2 + x3 ) )   %>% pander       # log of x1 in formula works fine

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-18.41	13.53	-1.361	0.1768
log(x1)	12.13	5.434	2.232	0.02795
x2	-14.89	1.672	-8.91	3.267e-14
x3	1.564	2.541	0.6155	0.5397

Fitting linear model: y ~ log(x1) + x2 + x3
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	25.47	0.8274	0.822

interactions

# interactions

summary( lm( y ~ x1 + x2 ) ) %>% pander

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	1.017	4.231	0.2404	0.8105
x1	1.781	0.2467	7.222	1.161e-10
x2	-1.721	2.372	-0.7254	0.4699

Fitting linear model: y ~ x1 + x2
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	20.99	0.8815	0.879

summary( lm( y ~ x1 + x2 + I(x1*x2) ) )  %>% pander

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	3.251	6.078	0.535	0.5939
x1	1.714	0.2806	6.108	2.149e-08
x2	-1.06	2.706	-0.3917	0.6961
*I(x1 x2)**	-0.0134	0.02608	-0.5139	0.6085

Fitting linear model: y ~ x1 + x2 + I(x1 * x2)
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	21.07	0.8818	0.8781

summary( lm( y ~ x1*x2 ) ) %>% pander    # shortcut to full interaction

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	3.251	6.078	0.535	0.5939
x1	1.714	0.2806	6.108	2.149e-08
x2	-1.06	2.706	-0.3917	0.6961
x1:x2	-0.0134	0.02608	-0.5139	0.6085

Fitting linear model: y ~ x1 * x2
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	21.07	0.8818	0.8781

dummy variables

# dummy variables

summary( lm( y ~ x1 + x2 + x3 + dum ) ) %>% pander     # drop one level

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-0.08294	5.32	-0.01559	0.9876
x1	1.774	0.2599	6.827	8.741e-10
x2	-1.828	2.477	-0.738	0.4624
x3	1.022	2.122	0.4815	0.6313
dumNJ	3.655	5.692	0.6421	0.5224
dumNY	-2.463	5.832	-0.4224	0.6737
dumPA	3.882	6.442	0.6026	0.5482

Fitting linear model: y ~ x1 + x2 + x3 + dum
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	21.24	0.8837	0.8762

summary( lm( y ~ x1 + x2 + x3 + dum - 1) ) %>% pander  # keep all, drop intercept

	Estimate	Std. Error	t value	Pr(>\|t\|)
x1	1.774	0.2599	6.827	8.741e-10
x2	-1.828	2.477	-0.738	0.4624
x3	1.022	2.122	0.4815	0.6313
dumMA	-0.08294	5.32	-0.01559	0.9876
dumNJ	3.572	5.323	0.671	0.5039
dumNY	-2.546	5.878	-0.4331	0.6659
dumPA	3.799	6.598	0.5758	0.5661

Fitting linear model: y ~ x1 + x2 + x3 + dum - 1
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	21.24	0.969	0.9666

ADVANCED

standardized regression coefficients (beta)

library( lm.beta )

m.01.beta <- lm.beta( m.01 )

summary( m.01.beta ) # %>% pander

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -54.24 -11.31   1.63  11.20  76.18 
## 
## Coefficients:
##             Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  0.94541      0.00000    4.21954   0.224    0.823    
## x            1.95242      0.93853    0.07254  26.915   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.94 on 98 degrees of freedom
## Multiple R-squared:  0.8808, Adjusted R-squared:  0.8796 
## F-statistic: 724.4 on 1 and 98 DF,  p-value: < 2.2e-16

# coef( m.01.beta )

# note the standard error is not standardized - describes regular coefficients

summary( m.01 ) %>% pander

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	0.9454	4.22	0.2241	0.8232
x	1.952	0.07254	26.91	4.603e-47

Fitting linear model: y ~ x
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	20.94	0.8808	0.8796

or just use the formula:

lm.beta <- function( my.mod ) 
{
    b <- summary(my.mod)$coef[-1, 1]
    sx <- sd( my.mod$model[,-1] )
    sy <- sd( my.mod$model[,1] )
    beta <- b * sx/sy
    return(beta)
}


coefficients( m.01 ) %>% pander

(Intercept)	x
0.9454	1.952

lm.beta( m.01 ) %>% pander

0.9385

robust standard errors

# install.packages( "sandwhich" )
# install.packages( "lmtest" )

library(sandwich)
library(lmtest)

m.01 <- lm( y ~ x )


# REGULAR STANDARD ERRORS - not robust

summary( m.01 ) %>% pander

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	0.9454	4.22	0.2241	0.8232
x	1.952	0.07254	26.91	4.603e-47

Fitting linear model: y ~ x
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
100	20.94	0.8808	0.8796

# ROBUST STANDARD ERRORS

# reproduce the Stata default
coeftest( m.01, vcov=vcovHC(m.01,"HC1") )    # robust; HC1 (Stata default)

## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.675088  0.2022   0.8402    
## x           1.952417   0.079322 24.6139   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ROBUST STANDARD ERRORS

# check that "sandwich" returns HC0
coeftest(m.01, vcov = sandwich)                # robust; sandwich

## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.628101  0.2043   0.8386    
## x           1.952417   0.078525 24.8638   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

coeftest(m.01, vcov = vcovHC(m.01, "HC0"))     # robust; HC0

## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.628101  0.2043   0.8386    
## x           1.952417   0.078525 24.8638   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ROBUST STANDARD ERRORS

# check that the default robust var-cov matrix is HC3
coeftest(m.01, vcov = vcovHC(m.01))            # robust; HC3

## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.770230  0.1982   0.8433    
## x           1.952417   0.080959 24.1162   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

coeftest(m.01, vcov = vcovHC(m.01, "HC3"))     # robust; HC3 (default)

## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.945414   4.770230  0.1982   0.8433    
## x           1.952417   0.080959 24.1162   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Regression in R