library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
schools <- read.csv("./schools.csv")
- AIC comparison shows that the model with a trend looks better, and there is definitely variation between the schools
- The multilevel models don’t help here
library(lme4)
## Loading required package: Matrix
base <- glm(cbind(applications, population) ~ 1, data = schools, family = "binomial")
fit.nml <- glm(cbind(applications, population) ~ 1 + school, data = schools, family = "binomial")
fit.nml.wtrend <- glm(cbind(applications, population) ~ 1 + school * year, data = schools, family = "binomial")
fit.ml.wtrend <- glmer(cbind(applications, population) ~ 1 + scale(year) + (1 + scale(year)| school), data = schools, family = "binomial")
fit.ml <- glmer(cbind(applications, population) ~ 1 + (1| school), data = schools, family = "binomial")
AIC(base, fit.nml, fit.nml.wtrend, fit.ml.wtrend, fit.ml)
## df AIC
## base 1 13912.785
## fit.nml 224 6474.879
## fit.nml.wtrend 448 6375.948
## fit.ml.wtrend 5 6887.382
## fit.ml 2 7017.367
Comparison of predictions
Gaussian model without trend
fit <- glm(applications ~ 1 + school, data = schools %>% filter(year < 2016), family = "gaussian")
preds <- schools %>% mutate(
expected_applications = predict(fit, schools)
) %>% filter(year == 2016)
mean((preds$expected_applications - preds$applications) ** 2)
## [1] 29.71987
Gaussian model with trend, multilevel
fit <- glmer(applications ~ 1 + scale(year) + (1 + scale(year) | school), data = schools %>% filter(year < 2016), family = "gaussian")
## Warning in glmer(applications ~ 1 + scale(year) + (1 + scale(year) |
## school), : calling glmer() with family=gaussian (identity link) as a
## shortcut to lmer() is deprecated; please call lmer() directly
preds <- schools %>% mutate(
expected_applications = predict(fit, schools)
) %>% filter(year == 2016)
mean((preds$expected_applications - preds$applications) ** 2)
## [1] 29.51741
Binomial model
fit <- glm(cbind(applications, population) ~ 1 + school, data = schools %>% filter(year < 2016), family = "binomial")
preds <- schools %>% mutate(
predicted_proportion = predict(fit, schools, type = "response"),
expected_applications = population * predicted_proportion
) %>% filter(year == 2016)
mean((preds$expected_applications - preds$applications) ** 2)
## [1] 30.97042
ggplot(preds, aes(x = applications, y = expected_applications)) + geom_point() + labs(
title = "Predictions vs. actual for 2016, model without yearly trend"
)
- The model without the trend has better predictions. A trend based on 3 years is maybe a bit too noisy.
Multilevel trend estimation
model <- glmer(cbind(applications, population) ~ 1 + scale(year) + (1 + scale(year) | school),
data = schools %>% filter(year < 2016),
family = "binomial")
preds <- schools %>% mutate(
predicted_proportion = predict(model, schools, type = "response"),
expected_applications = population * predicted_proportion
) %>% filter(year == 2016)
mean((preds$expected_applications - preds$applications) ** 2)
## [1] 35.79241
ggplot(preds, aes(x = applications, y = expected_applications)) + geom_point() + labs(
title = "Predictions vs. actual for 2016 with pooled trend"
)