Smarket <- Smarket %>%
rownames_to_column(var = "day") %>%
as_tibble()

3 Classification
4 ISLR: The Stock Market Data
4.1 Summary Statistics
We will begin by doing some descriptive analysis of the Smarket data, which is part of the ISLR2 library. This data set consists of percentage returns for the S&P 500 stock index over 1,250 days, from the beginning of 2001 until the end of 2005.
For each date, we have recorded the percentage returns for each of the five previous trading days,
Lag1throughLag5.It also contain a variable called
Directionwhich has the two labels"Up"and"Down".
| day | Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 2001 | 0.381 | -0.192 | -2.624 | -1.055 | 5.010 | 1.1913 | 0.959 | Up |
| 2 | 2001 | 0.959 | 0.381 | -0.192 | -2.624 | -1.055 | 1.2965 | 1.032 | Up |
| 3 | 2001 | 1.032 | 0.959 | 0.381 | -0.192 | -2.624 | 1.4112 | -0.623 | Down |
| 4 | 2001 | -0.623 | 1.032 | 0.959 | 0.381 | -0.192 | 1.2760 | 0.614 | Up |
| 5 | 2001 | 0.614 | -0.623 | 1.032 | 0.959 | 0.381 | 1.2057 | 0.213 | Up |
| 6 | 2001 | 0.213 | 0.614 | -0.623 | 1.032 | 0.959 | 1.3491 | 1.392 | Up |
Smarket %>%
dplyr::select(-day, -Year) %>%
tbl_summary(by = Direction, statistic = list(
all_continuous() ~ "{mean} ({sd})"),
digits = all_continuous() ~ 3)

| Characteristic | Down, N = 602¹ | Up, N = 648¹ |
|---|---|---|
| Lag1 | 0.051 (1.141) | -0.040 (1.131) |
| Lag2 | 0.032 (1.157) | -0.022 (1.117) |
| Lag3 | -0.006 (1.148) | 0.008 (1.130) |
| Lag4 | -0.003 (1.175) | 0.006 (1.105) |
| Lag5 | -0.001 (1.126) | 0.012 (1.168) |
| Volume | 1.470 (0.360) | 1.486 (0.361) |
| Today | -0.858 (0.754) | 0.803 (0.796) |
| 1 Mean (SD) | ||
Let us take a look at the correlation between the variables:
correlation <- round(cor(Smarket[,2:9]),2)Code
corrplot(correlation, type = 'lower', method="color",
tl.col = 'black', tl.srt = 45, # text label
addCoef.col = "black", # coefficients
col = COL2('BrBG'), diag=FALSE)We see some positive correlation between Year and Volume. Figure 4.2 confirms the upward trend between these two variables:
4.2 Logistic Regression
Next, we will fit a logistic regression model in order to predict Direction using Lag1 through Lag5 and Volume:
Smarket <- Smarket %>%
mutate(Direction1 = ifelse(Direction == "Down",0,1))glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket, family = binomial(link="logit"))Code
summary(glm.fit)$coef %>%
round(3) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))4.2.1 Prediction
Predictions are done much the same way. Here we use the model to predict on the data it was trained on.
# Predicted x'beta
xb <- predict(glm.fit, type = "link", newdata = Smarket)
# Predicted probability
prob <- predict(glm.fit, type = "response", newdata = Smarket)head(cbind(xb,prob)) %>%
round(3) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| xb | prob |
|---|---|
| 0.028 | 0.507 |
| -0.074 | 0.481 |
| -0.075 | 0.481 |
| 0.061 | 0.515 |
| 0.043 | 0.511 |
| 0.028 | 0.507 |
We can build a confusion matrix to assess prediction accuracy:
# Attach the fitted index and probability, then classify with a 0.5 cutoff.
pred_Smarket <- cbind(Smarket, xb, prob)
# Use nrow() instead of the hard-coded 1250 so the code keeps working if the
# number of observations ever changes.
pred_Smarket$Prediction <- rep("Down", nrow(pred_Smarket))
pred_Smarket$Prediction[prob > 0.5] <- "Up"

pred_Smarket %>%
  mutate(Prediction = as.factor(Prediction)) %>%
  conf_mat(truth = Direction, estimate = Prediction)

           Truth
Prediction Down  Up
      Down  145 141
      Up    457 507
Alternatively, the package yardstick provides some useful functions to evaluate model performance. For example, we can get the accuracy():
Code
pred_Smarket %>%
mutate(Prediction = as.factor(Prediction)) %>%
accuracy(truth = Direction, estimate = Prediction) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| .metric | .estimator | .estimate |
|---|---|---|
| accuracy | binary | 0.5216 |
What is the percentage of negatives (in this case, Direction=Up) correctly identified? That is defined by the Specificity metric:
Code
pred_Smarket %>%
mutate(Prediction = as.factor(Prediction)) %>%
spec(truth = Direction, estimate = Prediction) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| .metric | .estimator | .estimate |
|---|---|---|
| spec | binary | 0.7824074 |
We can use the confusionMatrix() function from the caret package to have a more detailed assessment:
cm <- confusionMatrix(factor(pred_Smarket$Prediction),
factor(pred_Smarket$Direction),
dnn = c("Prediction", "Direction"))
cmConfusion Matrix and Statistics
Direction
Prediction Down Up
Down 145 141
Up 457 507
Accuracy : 0.5216
95% CI : (0.4935, 0.5496)
No Information Rate : 0.5184
P-Value [Acc > NIR] : 0.4216
Kappa : 0.0237
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.2409
Specificity : 0.7824
Pos Pred Value : 0.5070
Neg Pred Value : 0.5259
Prevalence : 0.4816
Detection Rate : 0.1160
Detection Prevalence : 0.2288
Balanced Accuracy : 0.5116
'Positive' Class : Down
A more visual representation of the confusion matrix can be done by generating a ggplot2 chart:
# Convert the caret confusion matrix into a long data frame for plotting.
plt <- as.data.frame(cm$table)

# Flag correct cells and compute within-truth-class proportions.
# Refer to columns directly inside mutate(); `plt$Prediction` would bypass
# the pipe (and any grouping) and silently read the original object.
plotTable <- plt %>%
  mutate(Correct = ifelse(Prediction == Direction, "TRUE", "FALSE")) %>%
  group_by(Direction) %>%
  mutate(prop = Freq / sum(Freq)) %>%
  ungroup()

ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
  scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
  theme_bw() +
  xlim(rev(levels(plt$Direction)))

4.2.2 Out-Of-Sample Performance
We have just said something about the training error rate. As we have seen previously, the training error rate is often overly optimistic—it tends to underestimate the test error rate.
In order to better assess the accuracy of the logistic regression model in this setting, we can fit the model using part of the data, and then examine how well it predicts the held out data.
Since we are working with some data that has a time component, it is natural to fit the model using the observations from 2001 to 2004 and evaluate it on the last year of 2005. This would more closely match how such a model would be used in real life:
Smarket_train <- Smarket %>%
filter(Year <= 2004)
Smarket_test <- Smarket %>%
filter(Year == 2005)glm_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Smarket_train,
family = binomial)Predicting using the test data:
prob <- predict(glm_fit, type = "response", newdata = Smarket_test)
pred_Smarket_test <- cbind(Smarket_test, prob)
pred_Smarket_test$Prediction <- rep("Down", nrow(Smarket_test))
pred_Smarket_test$Prediction[prob > 0.5] <- "Up"confusion_test1 <- table(Prediction = factor(pred_Smarket_test$Prediction),
Direction = factor(pred_Smarket_test$Direction))Accuracy: 0.48
The results are rather disappointing: the accuracy is 48%, which is worse than random guessing!
Perhaps by removing the variables that appear not to be helpful in predicting Direction, we can obtain a more effective model.
glm_fit_minimal <- glm(Direction ~ Lag1 + Lag2, data = Smarket_train,
family = binomial)prob <- predict(glm_fit_minimal, type = "response", newdata = Smarket_test)
pred_Smarket_test <- cbind(Smarket_test, prob)
pred_Smarket_test$Prediction <- rep("Down", nrow(Smarket_test))
pred_Smarket_test$Prediction[prob > 0.5] <- "Up"confusion_test2 <- table(Prediction = factor(pred_Smarket_test$Prediction),
Direction = factor(pred_Smarket_test$Direction))Accuracy: 0.56
The refitted model now attains a 56% accuracy rate on the held-out test data. These results are a little bit better.
4.3 Linear Discriminant Analysis
lda.fit <- MASS::lda(Direction ~ Lag1 + Lag2, data = Smarket_train)
plot(lda.fit)Predictions are done using the test data:
prediction_lda <- predict(lda.fit, newdata = Smarket_test)
lda_Smarket <- cbind(Smarket_test, Prediction = prediction_lda$class)lda_Smarket %>%
mutate(Prediction = as.factor(Prediction)) %>%
conf_mat(truth = Direction, estimate = Prediction) Truth
Prediction Down Up
Down 35 35
Up 76 106
Code
cm <- confusionMatrix(factor(lda_Smarket$Prediction),
factor(lda_Smarket$Direction),
dnn = c("Prediction", "Direction"))
plt <- as.data.frame(cm$table)
plotTable <- plt %>%
mutate(Correct = ifelse(plt$Prediction == plt$Direction, "TRUE", "FALSE")) %>%
group_by(Direction) %>%
mutate(prop = Freq/sum(Freq))Code
ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
theme_bw() +
xlim(rev(levels(plt$Direction)))Accuracy: 0.56
Applying a 50% threshold to the posterior probabilities allows us to recreate the predictions contained in lda.pred$class.
sum(prediction_lda$posterior[, 1] >= 0.5)[1] 70
sum(prediction_lda$posterior[, 1] < 0.5)[1] 182
4.4 Quadratic Discriminant Analysis
We will now fit a QDA model:
qda.fit <- MASS::qda(Direction ~ Lag1 + Lag2, data = Smarket_train)prediction_qda <- predict(qda.fit, newdata = Smarket_test)
qda_Smarket <- cbind(Smarket_test, Prediction = prediction_qda$class)Code
cm <- confusionMatrix(factor(qda_Smarket$Prediction),
factor(qda_Smarket$Direction),
dnn = c("Prediction", "Direction"))
plt <- as.data.frame(cm$table)
plotTable <- plt %>%
mutate(Correct = ifelse(plt$Prediction == plt$Direction, "TRUE", "FALSE")) %>%
group_by(Direction) %>%
mutate(prop = Freq/sum(Freq))Code
ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
theme_bw() +
xlim(rev(levels(plt$Direction)))Accuracy: 0.599
4.5 Naive Bayes
Next, we fit a naive Bayes model to the Smarket data. Naive Bayes is implemented in R using the naiveBayes() function, which is part of the e1071 library.
nbayes.fit <- naiveBayes(Direction ~ Lag1 + Lag2, data = Smarket_train)prediction_nbayes <- predict(nbayes.fit , newdata = Smarket_test)
nbayes_Smarket <- cbind(Smarket_test, Prediction = prediction_nbayes)Code
cm <- confusionMatrix(factor(nbayes_Smarket$Prediction),
factor(nbayes_Smarket$Direction),
dnn = c("Prediction", "Direction"))
plt <- as.data.frame(cm$table)
plotTable <- plt %>%
mutate(Correct = ifelse(plt$Prediction == plt$Direction, "TRUE", "FALSE")) %>%
group_by(Direction) %>%
mutate(prop = Freq/sum(Freq))Code
ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
theme_bw() +
xlim(rev(levels(plt$Direction)))Accuracy: 0.591
The accuracy of the Naive Bayes is very similar to that of the QDA model. This seems reasonable since the below scatter plot shows that there is no apparent relationship between Lag1 vs Lag2 and thus the Naive Bayes’ assumption of independently distributed predictors is not unreasonable.
4.6 K-Nearest Neighbors
We will now perform KNN using the knn() function, which is part of the class library. First we need to re-arrange our train and test data:
train_x <- Smarket_train %>%
dplyr::select(Lag1, Lag2)
test_x <- Smarket_test %>%
dplyr::select(Lag1, Lag2)
train_y <- Smarket_train %>%
dplyr::select(Direction)

We initialize the random number generator with set.seed() to ensure that repeated runs produce consistent results and then use knn() to make predictions about the market direction in 2005. We set the number of neighbors to k = 3.
# Fix the seed so the random tie-breaking inside knn() is reproducible.
set.seed(1)
# knn()'s `cl` argument expects a factor vector of true classifications;
# pass the Direction column itself rather than coercing the one-column
# data frame to a character matrix with as.matrix().
knn_pred <- knn(as.matrix(train_x),
                as.matrix(test_x),
                train_y[["Direction"]],
                k = 3)
table(knn_pred, Smarket_test[["Direction"]])

knn_pred Down Up
    Down   48 55
    Up     63 86
Accuracy: 0.532
It appears that this model is not performing that well.
5 ISLR: Weekly Stock Data
This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
5.1 Summary Statistics
5.1.1 Question (a)
Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
Weekly <- Weekly %>%
rownames_to_column(var = "day") %>%
as_tibble()Weekly %>%
dplyr::select(-day, -Year) %>%
tbl_summary(by = Direction, statistic = list(
all_continuous() ~ "{mean} ({sd})"),
digits = all_continuous() ~ 3)

| Characteristic | Down, N = 484¹ | Up, N = 605¹ |
|---|---|---|
| Lag1 | 0.282 (2.315) | 0.045 (2.387) |
| Lag2 | -0.040 (2.292) | 0.304 (2.399) |
| Lag3 | 0.208 (2.282) | 0.099 (2.423) |
| Lag4 | 0.200 (2.443) | 0.102 (2.293) |
| Lag5 | 0.188 (2.372) | 0.102 (2.354) |
| Volume | 1.609 (1.699) | 1.547 (1.678) |
| Today | -1.747 (1.760) | 1.667 (1.531) |
| 1 Mean (SD) | ||
Let us take a look at the correlation between the variables:
correlation <- round(cor(Weekly[,2:9]),2)Code
corrplot(correlation, type = 'lower', method="color",
tl.col = 'black', tl.srt = 45, # text label
addCoef.col = "black", # coefficients
col = COL2('BrBG'), diag=FALSE)The variable Volume tends to increase in time. Figure 5.2 confirms this upward trend:
5.2 Logistic Regression
5.2.1 Question (b)
Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial(link="logit"))summary(glm.fit)$coef %>%
round(3) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| (Intercept) | 0.267 | 0.086 | 3.106 | 0.002 |
| Lag1 | -0.041 | 0.026 | -1.563 | 0.118 |
| Lag2 | 0.058 | 0.027 | 2.175 | 0.030 |
| Lag3 | -0.016 | 0.027 | -0.602 | 0.547 |
| Lag4 | -0.028 | 0.026 | -1.050 | 0.294 |
| Lag5 | -0.014 | 0.026 | -0.549 | 0.583 |
| Volume | -0.023 | 0.037 | -0.616 | 0.538 |
The variable Lag2 appears to be statistically significant at the 3% level (p = 0.030).
5.2.2 Question (c)
Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
# Predicted linear index x'beta
xb <- predict(glm.fit, type = "link", newdata = Weekly)
# Predicted probability P(Direction = "Up" | X)
prob <- predict(glm.fit, type = "response", newdata = Weekly)

pred_Weekly <- cbind(Weekly, xb, prob)
# Use nrow() rather than the hard-coded 1089 so the code keeps working if
# the data set changes.
pred_Weekly$Prediction <- rep("Down", nrow(pred_Weekly))
pred_Weekly$Prediction[prob > 0.5] <- "Up"

cm <- confusionMatrix(factor(pred_Weekly$Prediction),
                      factor(pred_Weekly$Direction),
                      dnn = c("Prediction", "Direction"))

Let’s build our confusion matrix in a visual way:
plt <- as.data.frame(cm$table)
plotTable <- plt %>%
mutate(Correct = ifelse(plt$Prediction == plt$Direction, "TRUE", "FALSE")) %>%
group_by(Direction) %>%
mutate(prop = Freq/sum(Freq))Code
ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
theme_bw() +
xlim(rev(levels(plt$Direction)))Accuracy:
Accuracy
0.561
5.2.3 Question (d)
Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
Weekly_train <- Weekly %>%
filter(Year <= 2008)
Weekly_test <- Weekly %>%
filter(Year > 2008)glm_fit <- glm(Direction ~ Lag2,
data = Weekly_train,
family = binomial)prob <- predict(glm_fit, type = "response", newdata = Weekly_test)
pred_Weekly_test <- cbind(Weekly_test, prob)
pred_Weekly_test$Prediction <- rep("Down", nrow(Weekly_test))
pred_Weekly_test$Prediction[prob > 0.5] <- "Up"Code
cm <- confusionMatrix(factor(pred_Weekly_test$Prediction),
factor(pred_Weekly_test$Direction),
dnn = c("Prediction", "Direction"))
plt <- as.data.frame(cm$table)
plotTable <- plt %>%
mutate(Correct = ifelse(plt$Prediction == plt$Direction, "TRUE", "FALSE")) %>%
group_by(Direction) %>%
mutate(prop = Freq/sum(Freq))Code
ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
theme_bw() +
xlim(rev(levels(plt$Direction)))Accuracy
0.625
5.3 LDA, QDA, KNN & Naive Bayes
5.3.1 Question (e) to (h)
LDA:
lda.fit <- MASS::lda(Direction ~ Lag2, data = Weekly_train)
plot(lda.fit)Predictions are done using the test data:
prediction_lda <- predict(lda.fit, newdata = Weekly_test)
lda_Weekly <- cbind(Weekly_test, Prediction = prediction_lda$class)Code
cm <- confusionMatrix(factor(lda_Weekly$Prediction),
factor(lda_Weekly$Direction),
dnn = c("Prediction", "Direction"))
plt <- as.data.frame(cm$table)
plotTable <- plt %>%
mutate(Correct = ifelse(plt$Prediction == plt$Direction, "TRUE", "FALSE")) %>%
group_by(Direction) %>%
mutate(prop = Freq/sum(Freq))Code
ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
theme_bw() +
xlim(rev(levels(plt$Direction)))Accuracy: 0.625
QDA:
We will now fit a QDA model:
qda.fit <- MASS::qda(Direction ~ Lag2, data = Weekly_train)prediction_qda <- predict(qda.fit, newdata = Weekly_test)
qda_Weekly <- cbind(Weekly_test, Prediction = prediction_qda$class)Code
cm <- confusionMatrix(factor(qda_Weekly$Prediction),
factor(qda_Weekly$Direction),
dnn = c("Prediction", "Direction"))
plt <- as.data.frame(cm$table)
plotTable <- plt %>%
mutate(Correct = ifelse(plt$Prediction == plt$Direction, "TRUE", "FALSE")) %>%
group_by(Direction) %>%
mutate(prop = Freq/sum(Freq))Code
ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
theme_bw() +
xlim(rev(levels(plt$Direction)))Accuracy: 0.587
KNN (k=1):
We will now perform KNN using the knn() function, which is part of the class library. First we need to re-arrange our train and test data:
train_x <- Weekly_train %>%
dplyr::select(Lag2)
test_x <- Weekly_test %>%
dplyr::select(Lag2)
train_y <- Weekly_train %>%
dplyr::select(Direction)

We initialize the random number generator with set.seed() to ensure that repeated runs produce consistent results and then use knn() to make predictions about the market direction in 2009–2010. We set the number of neighbors to k = 1.
set.seed(1)
knn_pred <- knn(as.matrix(train_x),
as.matrix(test_x),
as.matrix(train_y),
k = 1)table(knn_pred, Weekly_test[["Direction"]])
knn_pred Down Up
Down 21 30
Up 22 31
Accuracy: 0.5
It appears that this model is not performing that well.
Naive Bayes:
nbayes.fit <- naiveBayes(Direction ~ Lag2, data = Weekly_train)prediction_nbayes <- predict(nbayes.fit , newdata = Weekly_test)
nbayes_Weekly <- cbind(Weekly_test, Prediction = prediction_nbayes)Code
cm <- confusionMatrix(factor(nbayes_Weekly$Prediction),
factor(nbayes_Weekly$Direction),
dnn = c("Prediction", "Direction"))
plt <- as.data.frame(cm$table)
plotTable <- plt %>%
mutate(Correct = ifelse(plt$Prediction == plt$Direction, "TRUE", "FALSE")) %>%
group_by(Direction) %>%
mutate(prop = Freq/sum(Freq))Code
ggplot(plotTable, aes(x = Direction, y = Prediction, fill = Correct, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
scale_fill_manual(values = c("TRUE" = "#8F95D3", "FALSE" = "#DBB1BC")) +
theme_bw() +
xlim(rev(levels(plt$Direction)))Accuracy: 0.587
5.4 Model Comparison
5.4.1 Question (i)
Which of these methods appears to provide the best results on this data?
The two best models are Logit and LDA. The accuracy of the Naive Bayes is very similar to that of the QDA model, being both the second best option.
6 DSM: KNN, Logit & LDA
In this question you will produce a picture like Figure 2.2 in the book Elements of Statistical Learning on pp.15, but for a different dataset. Try to understand first the code for this Figure by running the posted file mixture.R, where the generated dataset for Figure 2.2 and the code is given.
6.0.1 Question 1
Generate a dataset consisting of n = 100 observations from the logit model below:
$$\Pr(Y = 1 \mid X_1, X_2) = \frac{1}{1 + \exp\{-(\beta_1 X_1 + \beta_2 X_2)\}},$$

with $\beta_1 = 2$, $\beta_2 = 3$, $X_1 \sim N(0, 1)$, and $X_2 \sim N(0, 4)$. Here the intercept is $\beta_0 = 0$, but you don’t need to impose this in the analysis.
n <- 100
set.seed(666)  # reproducibility of the simulated sample

# Covariates: x1 ~ N(0, 1), x2 ~ N(0, 4)
x1 <- rnorm(n, mean = 0, sd = 1)
x2 <- rnorm(n, mean = 0, sd = 2)

# True coefficients
b1 <- 2
b2 <- 3

# Single index and logistic success probability
z <- b1 * x1 + b2 * x2
pr <- 1 / (1 + exp(-z))

We can now generate our dependent variable:

# Bernoulli draws with success probability pr, stored as a factor (levels 0/1)
y <- as.factor(rbinom(n, 1, pr))

We generate a dataframe:

# Build the data frame directly; data.frame(cbind(...)) would run cbind()
# first and silently coerce the factor y to its integer codes (1/2)
# instead of keeping the 0/1 class labels.
data <- data.frame(y = y, x1 = x1, x2 = x2)

6.0.2 Question 2
Plot the data in the two dimensions $X_1$ and $X_2$, using orange and blue circles for the two classes in $Y$.
6.0.3 Question 3
The Bayes decision boundary is the set of points $(x_1, x_2)$ such that $\Pr(Y = 1 \mid x_1, x_2) = 0.5$.
For each $x_1$ in the simulated (or training) data, calculate the $x_2$ such that $\Pr(Y = 1 \mid x_1, x_2) = 0.5$ and add the Bayes decision boundary on the plot in b) using a dashed purple line. Is this boundary linear and can you find the exact formula for it? Explain.
We have that:
$$\Pr(Y = 1 \mid x_1, x_2) = \frac{1}{1 + \exp\{-(\beta_1 x_1 + \beta_2 x_2)\}} = 0.5,$$
which means that the odds ratio is $\exp(\beta_1 x_1 + \beta_2 x_2) = 1$. This implies that $\beta_1 x_1 + \beta_2 x_2 = 0$. Finally, we can get the relationship between the covariates, which is defined by a straight line:
$$x_2 = -\frac{\beta_1}{\beta_2}\, x_1.$$
Let’s generate our data now:
data <- cbind(data, bayes = -b1*x1/b2)data %>%
ggplot(aes(x = x1)) +
geom_point(aes(y = x2, color = as.factor(y)), size = 3, alpha = 0.65) +
geom_line(aes(y = bayes), size = 1, color = "purple", linetype = 6, alpha = 0.9) +
theme_bw() + xlab("X1") + ylab("X2") +
scale_color_manual(values = c("blue", "orange")) +
theme(legend.position="none")6.0.4 Question 4
Construct a test set on a grid of g = 50 values for $X_1$ and $X_2$, ranging from their minimum to their maximum. Generate a test set from each combination of $X_1$ and $X_2$, and call it test. Gather the training set for $(Y, X_1, X_2)$ into a data frame called train.
Let’s start with our train data:
train <- data.frame(cbind(y, x1, x2))Now our test data:
g <- 50
x1_test <- seq(min(x1), max(x1), length.out = g)
x2_test <- seq(min(x2), max(x2), length.out = g)
test <- expand.grid(x1_test, x2_test)6.0.5 Question 5
Run a KNN analysis with k = 3 nearest neighbors on the test data using the training data and the realizations of . Use the command knn().
Let’s scale our covariates first:
# Standardize the covariates before KNN — distance-based methods are
# sensitive to the scale of the inputs.
s.train <- scale(train[,2:3])
# NOTE(review): scale(test) standardizes the test grid with its OWN mean/sd,
# not the training statistics. For consistency it should arguably reuse them:
#   scale(test, center = attr(s.train, "scaled:center"),
#               scale  = attr(s.train, "scaled:scale"))
# — confirm against the intended mixture.R workflow before changing.
s.test <- scale(test)We run our knn():
# 3-nearest-neighbour fit; prob = TRUE stores the winning-class vote share
# as an attribute, used later to draw the decision boundary.
knn.fit3 <- knn(s.train, s.test, y, k=3, prob = TRUE)
Following the mixture.R code, plot the training data with circles, the test data with dots, each with the color blue or orange according to which class they either belong to (in the training data) or to which class they were assigned to (in the test data). Add the KNN decision boundary to the plot using contour() and the Bayes decision boundary.
prob3 <- attr(knn.fit3, "prob")
prob3 <- ifelse(knn.fit3=="1", prob3, 1-prob3) # if class =1 return the prob of 1
probm3 <- matrix(prob3, g, g) par(mar=rep(2,4))
contour(x1_test, x2_test, probm3, levels=0.5, labels="", xlab="X1", ylab="X2", main=
"3-nearest neighbour", axes=FALSE)
points(train[,2:3], col=ifelse(y==1, "orange", "blue"))
lines(data[,c("x1", "bayes")], type="l", lty=2, lwd=2, col="purple")
points(test, pch=".", cex=1.2, col=ifelse(probm3>0.5, "orange", "blue"))
box()6.0.7 Question 7
Repeat e) and f) with k = 10 and k = 15. Calculate the test error rate for each of k = 3; 10; 15, and the Bayes decision boundary. Which k gets closest to the Bayes decision boundary? Explain why this makes sense or not.
Case: k=10:
knn.fit10 <- knn(s.train, s.test, y, k=10, prob = TRUE)
prob10 <- attr(knn.fit10, "prob")
prob10 <- ifelse(knn.fit10=="1", prob10, 1-prob10) # if class =1 return the prob of 1
probm10 <- matrix(prob10, g, g) par(mar=rep(2,4))
contour(x1_test, x2_test, probm10, levels=0.5, labels="", xlab="X1", ylab="X2", main=
"10-nearest neighbour", axes=FALSE)
points(train[,2:3], col=ifelse(y==1, "orange", "blue"))
lines(data[,c("x1", "bayes")], type="l", lty=2, lwd=2, col="purple")
points(test, pch=".", cex=1.2, col=ifelse(probm10>0.5, "orange", "blue"))
box()Case: k=15:
knn.fit15 <- knn(s.train, s.test, y, k=15, prob = TRUE)
prob15 <- attr(knn.fit15, "prob")
prob15 <- ifelse(knn.fit15=="1", prob15, 1-prob15) # if class =1 return the prob of 1
probm15 <- matrix(prob15, g, g) par(mar=rep(2,4))
contour(x1_test, x2_test, probm15, levels=0.5, labels="", xlab="X1", ylab="X2", main=
"15-nearest neighbour", axes=FALSE)
points(train[,2:3], col=ifelse(y==1, "orange", "blue"))
lines(data[,c("x1", "bayes")], type="l", lty=2, lwd=2, col="purple")
points(test, pch=".", cex=1.2, col=ifelse(probm15>0.5, "orange", "blue"))
box()Finally, we can get the test error rate:
# Generate the true classes for test data
# True single index and logistic success probability on the 50x50 grid.
z.test = b1*test[,1] + b2*test[,2]
pr.test = 1/(1+exp(-z.test))
# NOTE(review): no set.seed() immediately before this rbinom(), so y.test —
# and every test error rate reported below — changes from run to run;
# consider fixing a seed here for reproducibility.
y.test = as.factor(rbinom(length(pr.test),1,pr.test))table(knn.fit3,y.test) y.test
knn.fit3 0 1
0 1318 35
1 115 1032
Test Error Rate (k=3): 0.06
table(knn.fit10,y.test) y.test
knn.fit10 0 1
0 1324 20
1 109 1047
Test Error Rate (k=10): 0.0516
table(knn.fit15,y.test) y.test
knn.fit15 0 1
0 1311 21
1 122 1046
Test Error Rate (k=15): 0.0572