Everyone is welcome here --- except those who have borrowed books from me for and have not returned them yet!

Easy Anova with R

Posted on August 25, 2012 in stats

This document provides a few examples of Analyses of Variance for typical experimental designs. We recommend to use of the ez package to perform ANOVAs with R.

require(ez)
## Loading required package: ez
require(lattice)
## Loading required package: lattice
require(ggplot2)
## Loading required package: ggplot2

S Design (one-way ANOVA between subjects)

We start by analyzing a design where the factor Subject is nested within the factor Group, that is, we want to compare independent groups.

ns <- c(20, 30, 20, 15)
means <- c(100, 110, 100, 130)
sd <- c(20, 20, 30, 25)

dat1 <- data.frame(
    subj=factor(1:sum(ns)),
    group=factor(rep(1:4, ns)),
    y=c(rnorm(ns[1], means[1], sd[1]),
        rnorm(ns[2], means[2], sd[2]),
        rnorm(ns[3], means[3], sd[3]),
        rnorm(ns[4], means[4], sd[4])))

Let us inspect the data:

str(dat1)
## 'data.frame':    85 obs. of  3 variables:
##  $ subj : Factor w/ 85 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ group: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ y    : num  60.4 89 83.4 82 127.5 ...
head(dat1)
##   subj group         y
## 1    1     1  60.37695
## 2    2     1  89.02115
## 3    3     1  83.43648
## 4    4     1  82.02231
## 5    5     1 127.46990
## 6    6     1  93.97232

Visual inspection

Different graphics can be used, the best choiice depends on the number of data points. For example, dotplots are good when there are not too many points:

require(lattice)
with(dat1, dotplot(y ~ group))

with(dat1, boxplot(y ~ group))

require(ggplot2)

ggplot(dat1, aes(group, y)) +
    geom_violin() + geom_dotplot(binaxis='y', stackdir='center', dotsize=.5)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

require(ez)

ezPlot(dat1, y, wid=subj, between=group, x=.(group))
## Warning: Data is unbalanced (unequal N per group). Make sure you specified
## a well-considered value for the type argument to ezANOVA().
## Warning in ezStats(data = data, dv = dv, wid = wid, within = within,
## within_full = within_full, : Unbalanced groups. Mean N will be used in
## computation of FLSD

Anova

ezANOVA(dat1, dv=y, wid=subj, between=group)
## Warning: Data is unbalanced (unequal N per group). Make sure you specified
## a well-considered value for the type argument to ezANOVA().
## $ANOVA
##   Effect DFn DFd        F          p p<.05       ges
## 1  group   3  81 5.028756 0.00301846     * 0.1570075
## 
## $`Levene's Test for Homogeneity of Variance`
##   DFn DFd      SSn      SSd        F         p p<.05
## 1   3  81 895.5602 19062.87 1.268441 0.2907774

To conduct post-hoc tests comparing all pairs of means:

hsd <- TukeyHSD(aov(y ~ group, data=dat1))
print(hsd)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = y ~ group, data = dat1)
## 
## $group
##          diff        lwr      upr     p adj
## 2-1 12.023670  -5.912702 29.96004 0.3008687
## 3-1  6.006819 -13.641492 25.65513 0.8533133
## 4-1 30.172876   8.950274 51.39548 0.0019756
## 3-2 -6.016851 -23.953223 11.91952 0.8152268
## 4-2 18.149206  -1.499105 37.79752 0.0807751
## 4-3 24.166057   2.943455 45.38866 0.0191370
plot(hsd)

If you worry about heteroscedasticity, you can check the diagnostic plots of lm:

par(mfcol=c(2, 2))
plot(lm(y ~ group, data=dat1))

If heteroscedasticity is severe, it will be visible on the plots of the “residuals vs. fitted” plot.

S*T Design (one-way Anova within subjects) ==========================================

In this design, the factor ‘Subject’ is crossed with a factor ‘Treatment’, that is, there is one data point per subject and treatment.

ns <- 20
means <- c(100, 110, 100, 130)
nt <- length(means)
sd <- 20

dat2 <- data.frame(
    subj=gl(ns, nt),
    treat=gl(nt, 1, nt*ns))

dat2$y <- rep(means, ns) + rep(rnorm(ns, sd=20), each=nt) + rnorm(nt*ns, sd=20)

str(dat2)
## 'data.frame':    80 obs. of  3 variables:
##  $ subj : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ treat: Factor w/ 4 levels "1","2","3","4": 1 2 3 4 1 2 3 4 1 2 ...
##  $ y    : num  71.6 104.5 109.1 111.3 119.2 ...
head(dat2)
##   subj treat        y
## 1    1     1  71.6135
## 2    1     2 104.5033
## 3    1     3 109.1391
## 4    1     4 111.3347
## 5    2     1 119.1887
## 6    2     2 148.2882

We first plot the raw data, per subject:

require(lattice)
xyplot(y ~ treat | subj, data=dat2, type='b')

Or on a single plot (in the right panel, the subject effects are removed):

par(mfcol=c(1,2))
with(dat2, interaction.plot(treat, subj, y))
with(dat2, interaction.plot(treat, subj, y-tapply(y, subj, mean)[subj]))

And we perform the Anova with ez:

ezPlot(dat2, dv=y, wid=subj, within=treat, x=.(treat))

ezANOVA(dat2, dv=y, wid=subj, within=treat)
## $ANOVA
##   Effect DFn DFd        F            p p<.05       ges
## 2  treat   3  57 8.937576 6.039941e-05     * 0.1329535
## 
## $`Mauchly's Test for Sphericity`
##   Effect         W        p p<.05
## 2  treat 0.7796895 0.492553      
## 
## $`Sphericity Corrections`
##   Effect       GGe        p[GG] p[GG]<.05       HFe       p[HF] p[HF]<.05
## 2  treat 0.8494629 0.0001768661         * 0.9921576 6.38662e-05         *

S Design (Two-way between subjects Anova)

Generation of a dataset S<A3*B2>

subject = factor(paste('sub', 1:30, sep=''))
A = gl(3 ,10, 30, labels=c('a1','a2','a3'))
B = gl(2, 5, 30, labels=c('b1', 'b2'))
x = rnorm(30, mean=10) + 1 * (A=='a1' & B=='b2')
dat3 = data.frame(subject, A, B, x)
rm(subject, A, B, x)
attach(dat3)

Classical R approach

table(A, B)
##     B
## A    b1 b2
##   a1  5  5
##   a2  5  5
##   a3  5  5
tapply(x, list(A, B), mean)
##           b1        b2
## a1 10.297974 11.027699
## a2  9.655504 10.652154
## a3  9.865829  9.541259
interaction.plot(A,B,x)

summary(aov(x ~ A * B, data=dat3))
##             Df Sum Sq Mean Sq F value Pr(>F)  
## A            2  4.607  2.3035   3.159 0.0605 .
## B            1  1.638  1.6375   2.246 0.1470  
## A:B          2  2.440  1.2202   1.674 0.2087  
## Residuals   24 17.499  0.7291                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Using ez

ezPlot(data=dat3,  dv=.(x), wid=.(subject), between=.(A,B), 
       x=.(B), split=.(A))

ezANOVA(data=dat3, dv=x, wid=subject, between=c('A','B'))
## $ANOVA
##   Effect DFn DFd        F          p p<.05        ges
## 1      A   2  24 3.159327 0.06053629       0.20840814
## 2      B   1  24 2.245971 0.14700205       0.08557395
## 3    A:B   2  24 1.673517 0.20874318       0.12239109
## 
## $`Levene's Test for Homogeneity of Variance`
##   DFn DFd      SSn      SSd         F         p p<.05
## 1   5  24 1.619963 8.442381 0.9210464 0.4844069
detach(dat3)

SAB Design (Two-way within-subject Anova)

Same dataset but with A & B within subject

dat4 <- dat3
subject = gl(5, 1, 30, labels=paste('sub', 1:5, sep=''))
dat4$subject = subject
table(dat4$subject, dat4$A, dat4$B)
## , ,  = b1
## 
##       
##        a1 a2 a3
##   sub1  1  1  1
##   sub2  1  1  1
##   sub3  1  1  1
##   sub4  1  1  1
##   sub5  1  1  1
## 
## , ,  = b2
## 
##       
##        a1 a2 a3
##   sub1  1  1  1
##   sub2  1  1  1
##   sub3  1  1  1
##   sub4  1  1  1
##   sub5  1  1  1
attach(dat4)
## The following object is masked _by_ .GlobalEnv:
## 
##     subject
interaction.plot(A:B, subject, x)

summary(aov(x ~ A*B + Error(subject/(A*B))))
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  1.502  0.3754               
## 
## Error: subject:A
##           Df Sum Sq Mean Sq F value Pr(>F)  
## A          2  4.607  2.3035   3.515 0.0803 .
## Residuals  8  5.243  0.6554                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: subject:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  1.637  1.6375   3.874   0.12
## Residuals  4  1.691  0.4227               
## 
## Error: subject:A:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## A:B        2  2.440   1.220   1.077  0.385
## Residuals  8  9.063   1.133
summary(aov(x ~ A + Error(subject/A), data=dat4, subset= (B=='b1')))
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4 0.7465  0.1866               
## 
## Error: subject:A
##           Df Sum Sq Mean Sq F value Pr(>F)
## A          2  1.073  0.5365   0.336  0.724
## Residuals  8 12.758  1.5947
summary(aov(x ~ A + Error(subject/A), data=dat4, subset= (B=='b2')))
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  2.446  0.6115               
## 
## Error: subject:A
##           Df Sum Sq Mean Sq F value Pr(>F)   
## A          2  5.974  2.9872   15.43 0.0018 **
## Residuals  8  1.549  0.1936                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
for (a in levels(A))
  {
  print(paste("Effect of B for A =",a))
  print(summary(aov(x ~ B + Error(subject/(B), data=dat4, subset=(A==a)))))
}
## [1] "Effect of B for A = a1"
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  1.502  0.3754               
## 
## Error: subject:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  1.637  1.6375   3.874   0.12
## Residuals  4  1.691  0.4227               
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 20  21.35   1.068               
## [1] "Effect of B for A = a2"
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  1.502  0.3754               
## 
## Error: subject:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  1.637  1.6375   3.874   0.12
## Residuals  4  1.691  0.4227               
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 20  21.35   1.068               
## [1] "Effect of B for A = a3"
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  4  1.502  0.3754               
## 
## Error: subject:B
##           Df Sum Sq Mean Sq F value Pr(>F)
## B          1  1.637  1.6375   3.874   0.12
## Residuals  4  1.691  0.4227               
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 20  21.35   1.068
detach(dat4)
ezPlot(data=dat4,  dv=.(x), wid=.(subject), between=.(A), within=.(B), 
       x=.(B), split=.(A))
## Warning: The column supplied as the wid variable contains non-unique values
## across levels of the supplied between-Ss variables. Automatically fixing
## this by generating unique wid labels.

ezANOVA(data=dat4,
        dv=x,
        wid=subject,
        within=.(A, B))
## $ANOVA
##   Effect DFn DFd        F          p p<.05        ges
## 2      A   2   8 3.514660 0.08027912       0.20840814
## 3      B   1   4 3.874032 0.12040984       0.08557395
## 4    A:B   2   8 1.077058 0.38529324       0.12239109
## 
## $`Mauchly's Test for Sphericity`
##   Effect         W         p p<.05
## 2      A 0.4038535 0.2566468      
## 4    A:B 0.9518376 0.9286333      
## 
## $`Sphericity Corrections`
##   Effect       GGe     p[GG] p[GG]<.05       HFe     p[HF] p[HF]<.05
## 2      A 0.6265089 0.1175679           0.7763227 0.1007354          
## 4    A:B 0.9540506 0.3838172           1.8023113 0.3852932

Split-plot ANOVA (A within, B between)

dat5 <- dat3
subject = gl(10, 1, 30, labels=paste('sub', 1:10, sep=''))
dat5$subject = subject
table(dat5$subject, dat5$A, dat5$B)
## , ,  = b1
## 
##        
##         a1 a2 a3
##   sub1   1  1  1
##   sub2   1  1  1
##   sub3   1  1  1
##   sub4   1  1  1
##   sub5   1  1  1
##   sub6   0  0  0
##   sub7   0  0  0
##   sub8   0  0  0
##   sub9   0  0  0
##   sub10  0  0  0
## 
## , ,  = b2
## 
##        
##         a1 a2 a3
##   sub1   0  0  0
##   sub2   0  0  0
##   sub3   0  0  0
##   sub4   0  0  0
##   sub5   0  0  0
##   sub6   1  1  1
##   sub7   1  1  1
##   sub8   1  1  1
##   sub9   1  1  1
##   sub10  1  1  1
table(dat5$subject, dat5$B:dat5$A)
##        
##         b1:a1 b1:a2 b1:a3 b2:a1 b2:a2 b2:a3
##   sub1      1     1     1     0     0     0
##   sub2      1     1     1     0     0     0
##   sub3      1     1     1     0     0     0
##   sub4      1     1     1     0     0     0
##   sub5      1     1     1     0     0     0
##   sub6      0     0     0     1     1     1
##   sub7      0     0     0     1     1     1
##   sub8      0     0     0     1     1     1
##   sub9      0     0     0     1     1     1
##   sub10     0     0     0     1     1     1
summary(aov(x ~ A*B + Error(subject/A), data=dat5))
## 
## Error: subject
##           Df Sum Sq Mean Sq F value Pr(>F)  
## B          1  1.638  1.6375   4.104 0.0774 .
## Residuals  8  3.192  0.3991                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: subject:A
##           Df Sum Sq Mean Sq F value Pr(>F)
## A          2  4.607  2.3035   2.576  0.107
## A:B        2  2.440  1.2202   1.365  0.284
## Residuals 16 14.306  0.8941
ezANOVA(data=dat5, dv=x, wid=subject, within=.(A), between=.(B))
## $ANOVA
##   Effect DFn DFd        F          p p<.05        ges
## 2      B   1   8 4.103614 0.07736707       0.08557395
## 3      A   2  16 2.576220 0.10717072       0.20840814
## 4    B:A   2  16 1.364641 0.28365531       0.12239109
## 
## $`Mauchly's Test for Sphericity`
##   Effect         W         p p<.05
## 3      A 0.8219254 0.5034003      
## 4    B:A 0.8219254 0.5034003      
## 
## $`Sphericity Corrections`
##   Effect       GGe     p[GG] p[GG]<.05      HFe     p[HF] p[HF]<.05
## 3      A 0.8488427 0.1182849           1.053515 0.1071707          
## 4    B:A 0.8488427 0.2839227           1.053515 0.2836553

S*T Design (Split-plot Anova with two within variables) ==================================

One can have both between and within-subject factors. We consider here the case of a S20<G2>*A4*B2 design where S=subject is nested within a factor Group and crossed with the factors A and B which are also crossed with each other.

dat6 = data.frame(
  subj = gl(20, 8, 160),
  group = gl(2, 80, 160),
  a = gl(4, 1, 160),
  b = gl(2, 4, 160),
  y = rnorm(160, 100, 10))

str(dat6)
## 'data.frame':    160 obs. of  5 variables:
##  $ subj : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 2 2 ...
##  $ group: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ a    : Factor w/ 4 levels "1","2","3","4": 1 2 3 4 1 2 3 4 1 2 ...
##  $ b    : Factor w/ 2 levels "1","2": 1 1 1 1 2 2 2 2 1 1 ...
##  $ y    : num  91.9 87.7 105.5 89.4 94.2 ...
head(dat6)
##   subj group a b         y
## 1    1     1 1 1  91.93083
## 2    1     1 2 1  87.66202
## 3    1     1 3 1 105.46537
## 4    1     1 4 1  89.39697
## 5    1     1 1 2  94.23032
## 6    1     1 2 2 104.47032
with(dat6, table(subj, group))
##     group
## subj 1 2
##   1  8 0
##   2  8 0
##   3  8 0
##   4  8 0
##   5  8 0
##   6  8 0
##   7  8 0
##   8  8 0
##   9  8 0
##   10 8 0
##   11 0 8
##   12 0 8
##   13 0 8
##   14 0 8
##   15 0 8
##   16 0 8
##   17 0 8
##   18 0 8
##   19 0 8
##   20 0 8
with(dat6, table(subj, a:b))
##     
## subj 1:1 1:2 2:1 2:2 3:1 3:2 4:1 4:2
##   1    1   1   1   1   1   1   1   1
##   2    1   1   1   1   1   1   1   1
##   3    1   1   1   1   1   1   1   1
##   4    1   1   1   1   1   1   1   1
##   5    1   1   1   1   1   1   1   1
##   6    1   1   1   1   1   1   1   1
##   7    1   1   1   1   1   1   1   1
##   8    1   1   1   1   1   1   1   1
##   9    1   1   1   1   1   1   1   1
##   10   1   1   1   1   1   1   1   1
##   11   1   1   1   1   1   1   1   1
##   12   1   1   1   1   1   1   1   1
##   13   1   1   1   1   1   1   1   1
##   14   1   1   1   1   1   1   1   1
##   15   1   1   1   1   1   1   1   1
##   16   1   1   1   1   1   1   1   1
##   17   1   1   1   1   1   1   1   1
##   18   1   1   1   1   1   1   1   1
##   19   1   1   1   1   1   1   1   1
##   20   1   1   1   1   1   1   1   1
xyplot(y ~ a  | subj, group=b, data=dat6, type='b')

signif(with(dat6, tapply(y, list(group=group, a=a, b=b), mean )), 3)
## , , b = 1
## 
##      a
## group    1     2   3    4
##     1 95.4  99.6 100 95.8
##     2 97.5 102.0 101 98.4
## 
## , , b = 2
## 
##      a
## group     1     2   3   4
##     1 101.0  99.7 102 104
##     2  98.5 100.0 102 101
ezPlot(dat6, dv=y, wid=subj, within = .(a, b), between=group, x=.(a), split=b, col=group)
## Warning: Mixed within-and-between-Ss effect requested; FLSD is only
## appropriate for within-Ss comparisons (see warning in ?ezStats or ?ezPlot).