MACS 30500
University of Chicago
##
## Call:
## glm(formula = Survived ~ Age * Sex, family = binomial, data = titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9401 -0.7136 -0.5883 0.7626 2.2455
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.59380 0.31032 1.913 0.05569 .
## Age 0.01970 0.01057 1.863 0.06240 .
## Sexmale -1.31775 0.40842 -3.226 0.00125 **
## Age:Sexmale -0.04112 0.01355 -3.034 0.00241 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 740.40 on 710 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 748.4
##
## Number of Fisher Scoring iterations: 4
library(modelr)
# Randomly partition the passengers: 30% held out as `test`, 70% kept as
# `train`. The assignment is random, so the exact rows differ between runs
# unless a seed has been set earlier in the session.
titanic_split <- titanic %>%
  resample_partition(p = c(test = 0.3, train = 0.7))

# Show the (rows, columns) dimensions of each partition.
map(titanic_split, dim)
## $test
## [1] 267 12
##
## $train
## [1] 624 12
# Fit an additive (no interaction) logistic regression of survival on age
# and sex, using only the training partition.
train_model <- glm(
  formula = Survived ~ Age + Sex,
  family = binomial,
  data = titanic_split$train
)

# Coefficient table and fit statistics for the training-set model.
summary(train_model)
##
## Call:
## glm(formula = Survived ~ Age + Sex, family = binomial, data = titanic_split$train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7661 -0.6833 -0.6459 0.7424 1.9271
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.336048 0.278575 4.796 1.62e-06 ***
## Age -0.006271 0.007577 -0.828 0.408
## Sexmale -2.521495 0.223496 -11.282 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 672.23 on 496 degrees of freedom
## Residual deviance: 516.10 on 494 degrees of freedom
## (127 observations deleted due to missingness)
## AIC: 522.1
##
## Number of Fisher Scoring iterations: 4
# Score the held-out test partition with the model fit on the training data.
#
# Steps:
#   1. Materialize the test partition as a tibble (`as_tibble()` replaces the
#      deprecated `dplyr::tbl_df()`).
#   2. Add the model's predictions as a `pred` column.
#   3. Convert `pred` with `logit2prob()` (defined elsewhere; presumably maps
#      log-odds to probabilities -- confirm against its definition).
#   4. Classify as 1 when the converted value exceeds 0.5, otherwise 0.
x_test_accuracy <- titanic_split$test %>%
  as_tibble() %>%
  add_predictions(train_model) %>%
  mutate(
    prob = logit2prob(pred),
    pred = as.numeric(prob > 0.5)
  )

# Share of test observations whose predicted class matches the observed
# outcome. Rows where the comparison is NA (e.g. no prediction because a
# predictor is missing) are excluded by `na.rm = TRUE`.
mean(x_test_accuracy$Survived == x_test_accuracy$pred, na.rm = TRUE)
## [1] 0.7695853
\[MSE = \frac{1}{n} \sum_{i = 1}^{n}{(y_i - \hat{f}(x_i))^2}\]
\[CV_{(n)} = \frac{1}{n} \sum_{i = 1}^{n}{MSE_i}\]
# Logistic regression with an Age x Sex interaction, fit on the full data.
titanic_model <- glm(
  formula = Survived ~ Age * Sex,
  family = binomial,
  data = titanic
)

# Leave-one-out cross-validation: `cv.glm()` is called without `K`, on the
# rows where the response and both predictors are observed.
titanic_loocv <- cv.glm(
  data = filter(titanic, !is.na(Survived) & !is.na(Age) & !is.na(Sex)),
  glmfit = titanic_model
)

# First element of `delta`: the cross-validation estimate of prediction error.
titanic_loocv$delta[[1]]
## [1] 0.1703518
\[CV_{(k)} = \frac{1}{k} \sum_{i = 1}^{k}{MSE_i}\]
# 10-fold cross-validation of `titanic_model`, again restricted to rows where
# the response and both predictors are observed. Fold assignment is random,
# so the estimate varies between runs unless a seed has been set.
titanic_kfold <- cv.glm(
  data = filter(titanic, !is.na(Survived) & !is.na(Age) & !is.na(Sex)),
  glmfit = titanic_model,
  K = 10
)

# First element of `delta`: the cross-validation estimate of prediction error.
titanic_kfold$delta[[1]]
## [1] 0.1708052