MACS 30500
University of Chicago
\[Y = f(X) + \epsilon\]
\[Y = \beta_0 + \beta_{1}X_1\]
## Classes 'tbl_df', 'tbl' and 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr NA "C85" NA "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 12
## .. ..$ PassengerId: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Survived : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Pclass : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Name : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Sex : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Age : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ SibSp : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Parch : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Ticket : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Fare : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Cabin : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Embarked : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
\[P(\text{survival} = \text{Yes} | \text{age})\]
# Fit a logistic regression (binomial GLM) of Survived on Age alone, then
# print the coefficient table and fit statistics.
survive_age <- glm(
  formula = Survived ~ Age,
  family = binomial,
  data = titanic
)
summary(survive_age)
##
## Call:
## glm(formula = Survived ~ Age, family = binomial, data = titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1488 -1.0361 -0.9544 1.3159 1.5908
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.05672 0.17358 -0.327 0.7438
## Age -0.01096 0.00533 -2.057 0.0397 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 960.23 on 712 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 964.23
##
## Number of Fisher Scoring iterations: 4
##
## Call:
## glm(formula = Survived ~ Age + Sex, family = binomial, data = titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7405 -0.6885 -0.6558 0.7533 1.8989
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.277273 0.230169 5.549 2.87e-08 ***
## Age -0.005426 0.006310 -0.860 0.39
## Sexmale -2.465920 0.185384 -13.302 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 749.96 on 711 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 755.96
##
## Number of Fisher Scoring iterations: 4
\[f = \beta_{0} + \beta_{1}\text{age} + \beta_{2}\text{gender}\]
\[f = \beta_{0} + \beta_{1}\text{age} + \beta_{2}\text{gender} + \beta_{3}(\text{age} \times \text{gender})\]
\[f = \beta_{0} + \beta_{1}\text{age} + \beta_{2}\text{gender}\]
\[f = \beta_{0} + \beta_{1}\text{age} + \beta_{2}\text{gender} + \beta_{3}(\text{age} \times \text{gender})\]
# Attach the age-only model's predictions to every passenger, transform them
# with logit2prob() (defined elsewhere; presumably log-odds -> probability),
# and classify as 1 when the transformed value exceeds 0.5.
age_accuracy <- titanic %>%
  add_predictions(survive_age) %>%
  mutate(pred = as.numeric(logit2prob(pred) > 0.5))

# Share of passengers whose predicted class matches the observed outcome;
# comparisons that evaluate to NA are dropped by na.rm = TRUE.
mean(age_accuracy$pred == age_accuracy$Survived, na.rm = TRUE)
## [1] 0.5938375
# Same accuracy calculation as for the age-only model, but using
# survive_age_woman_x (fitted elsewhere in the document).
x_accuracy <- titanic %>%
  add_predictions(survive_age_woman_x) %>%
  mutate(pred = as.numeric(logit2prob(pred) > 0.5))

# Share of passengers whose predicted class matches the observed outcome;
# comparisons that evaluate to NA are dropped by na.rm = TRUE.
mean(x_accuracy$pred == x_accuracy$Survived, na.rm = TRUE)
## [1] 0.780112
# Partition titanic into a 30% test piece and a 70% training piece, then show
# the dimensions of each piece.
# NOTE(review): the partition is drawn at random, so results differ between
# runs unless a seed is set earlier in the document -- confirm.
titanic_split <- resample_partition(
  data = titanic,
  p = c(test = 0.3, train = 0.7)
)
map(titanic_split, dim)
## $test
## [1] 267 12
##
## $train
## [1] 624 12
# Fit the Age-by-Sex interaction logistic regression on the training
# partition only, then print the model summary.
train_model <- glm(
  formula = Survived ~ Age * Sex,
  family = binomial,
  data = titanic_split$train
)
summary(train_model)
##
## Call:
## glm(formula = Survived ~ Age * Sex, family = binomial, data = titanic_split$train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9593 -0.7583 -0.5819 0.7579 2.2864
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.62067 0.38039 1.632 0.10275
## Age 0.02000 0.01269 1.576 0.11508
## Sexmale -1.10345 0.47998 -2.299 0.02151 *
## Age:Sexmale -0.04569 0.01579 -2.894 0.00381 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 682.02 on 500 degrees of freedom
## Residual deviance: 529.16 on 497 degrees of freedom
## (123 observations deleted due to missingness)
## AIC: 537.16
##
## Number of Fisher Scoring iterations: 4
# Score the model fitted on the training partition against the held-out test
# partition.
x_test_accuracy <- titanic_split$test %>%
  # Materialise the test partition as a tibble. as_tibble() replaces tbl_df(),
  # which dplyr has deprecated.
  as_tibble() %>%
  add_predictions(train_model) %>%
  # logit2prob() is defined elsewhere (presumably log-odds -> probability);
  # classify as 1 when the transformed prediction exceeds 0.5.
  mutate(pred = logit2prob(pred),
         pred = as.numeric(pred > .5))

# Share of test rows whose predicted class matches the observed outcome;
# comparisons that evaluate to NA are dropped by na.rm = TRUE.
mean(x_test_accuracy$Survived == x_test_accuracy$pred, na.rm = TRUE)
## [1] 0.8028169
“gganimate” by David Robinson.