# NAIVE SIMULATION
= 200
N = 0.3
smd
= rnorm(N/2, 0, 1)
scores0 = rnorm(N/2, 0+smd, 1)
scores1 = c(scores0, scores1)
scores
= rep("group0", N/2)
group0 = rep("group1", N/2)
group1 = c(group0, group1)
group
= data.frame(group, scores) df
Examples of Monte Carlo Simulations
Basics of R for Data Science
Linear model, just testing a between-group difference, but inference is based on model-comparison using the AIC
A two-group comparison is as straightforward as a t-test. For power analysis, the pwr.t.test()
function from the pwr
package is sufficient. However, to add some complexity and justify using Monte Carlo simulation in this basic scenario, let’s consider making inferences based on the best-fitting model using Akaike Information Criterion (AIC).
What is reported below is basically a series of increasingly more sophisticated ways of doing the same thing. Just stop where it is enough for you!
Data simulation
This might be a naive way of computing scores for two groups to compare:
However, the following is better because it entails understanding the logic of the underlying statistical (linear) model; note how the dependent variable scores
is modeled using the formula \(y = b0 + b1*x + residuals\)
# MORE COMPETENT SIMULATION
= 200
N = 0.3
smd
= rep(0:1, each=N/2)
group = 0 + smd * group + rnorm(N, 0, 1)
scores
= ifelse(group==0, "group0", "group1") # optional
group
= data.frame(group, scores) df
Let’s run the simulation with a for
loop!
= 200
N = 0.3
smd
= 1e4
niter = data.frame(iter = 1:niter,
results AICnull = NA,
AICaltern = NA)
for(i in 1:niter){
= rep(0:1, each=N/2)
group = 0 + smd * group + rnorm(N, 0, 1)
scores = data.frame(group, scores)
df = lm(scores ~ 1, data=df)
fitNull = lm(scores ~ group, data=df)
fitAltern $AICnull[i] = AIC(fitNull)
results$AICaltern[i] = AIC(fitAltern)
results
}
mean(results$AICaltern < results$AICnull)
[1] 0.7673
The same, but with replicate()
While using the following in alternative to a for
loop may look less readable in this case, it has a few advantages, including: 1) readability because it wraps potentially long sequences of code into separate custom functions; 2) it facilitates parallelization (while the for
loop sticks to a sequential logic).
# first define a custom function for data generation
= function(N=NA, smd=NA){
simStudy = rep(0:1, each=N/2)
group = 0 + smd * group + rnorm(N, 0, 1)
scores = data.frame(group, scores)
df = lm(scores ~ 1, data=df)
fitNull = lm(scores ~ group, data=df)
fitAltern return(list(AICnull=AIC(fitNull), AICaltern=AIC(fitAltern)))
}
# now run the loop
= 1e4
niter = replicate(niter, simStudy(N=200, smd=0.3))
x
# finally extract results and compute power
= unlist(x["AICaltern",])
AICaltern = unlist(x["AICnull",])
AICnull
mean(AICaltern < AICnull)
[1] 0.7525
The same, but extending to multiple sample sizes and multiple effect sizes
# set parameters and initialize results
= expand.grid(N = c(50,100,200,400),
results smd = c(0.2, 0.4),
power = NA)
# now run the loop(s)
= 1000
niter for(i in 1:nrow(results)){
= replicate(niter, simStudy(results$N[i], results$smd[i]))
x = unlist(x["AICaltern",])
AICaltern = unlist(x["AICnull",])
AICnull $power[i] = mean(AICaltern < AICnull)
results
}
# plot
library(ggplot2)
$smd = as.factor(results$smd)
results
ggplot(results, aes(x=N, y=power, group=smd, color=smd, linetype=smd, shape=smd)) +
geom_point(size=5) +
geom_line(size=1) +
theme(text=element_text(size=22)) +
geom_hline(yintercept=0.8, size=1, linetype=2)+
ylab("power (with AIC)")
The same, but more compact with mapply()
# set parameters and initialize results
= 1000
niter = expand.grid(N = c(50,100,200,400),
allIters smd = c(0.2, 0.4),
AICaltern = NA,
AICnull = NA,
success = NA)
= allIters[rep(1:nrow(allIters), each = niter), ]
allIters
# now run the loop(s)
= mapply(simStudy, N=allIters$N, smd=allIters$smd)
x
# compute results
$AICaltern = unlist(x["AICaltern", ])
allIters$AICnull = unlist(x["AICnull", ])
allIters$success = allIters$AICaltern < allIters$AICnull
allIters= aggregate(allIters$success, by=list(N=allIters$N, smd=allIters$smd), FUN=mean)
results
# show results
results
N smd x
1 50 0.2 0.255
2 100 0.2 0.356
3 200 0.2 0.504
4 400 0.2 0.712
5 50 0.4 0.522
6 100 0.4 0.709
7 200 0.4 0.921
8 400 0.4 0.997