Try to solve the tasks first without looking at the answers, and if not finding a solution quickly then check the answers.
Extract the data set crabs from the library MASS, save it as crabs1, and read about these data in the help.
# Load MASS and work on a copy of the crabs data so the original stays intact.
library(MASS)
crabs1 <- crabs
# ?crabs  # help page describing the crabs data set
Compute a logistic regression model with sex as target and the RW:CW measurements as features in data set crabs1; use the cost matrix (row 1 = F, row 2 = M)
\[\begin{bmatrix} 0 & 2\\ 3 & 0 \end{bmatrix}\]
and report confusion matrix for the classification of the training data.
# Logistic regression for sex with a 2x2 misclassification cost matrix:
# cost(true F predicted M) = 2, cost(true M predicted F) = 3.
library(MASS)
library(dplyr)

crabs2 <- crabs1 %>% select(sex, RW:CW)
levels(crabs1$sex)  # "F" is the first level, so glm models P(sex = "M")
## [1] "F" "M"

# For more than two classes use nnet::multinom instead of glm.
m1 <- glm(sex ~ ., data = crabs2, family = "binomial")
ProbM <- predict(m1, type = "response")  # fitted P(M) on the training data
ProbF <- 1 - ProbM

# Alternative A (two classes only): decide "F" when the expected loss of
# predicting M (2 * ProbF) exceeds that of predicting F (3 * ProbM),
# i.e. when ProbF / ProbM > 3 / 2.
Pred <- ifelse(ProbF / ProbM > 3 / 2, "F", "M")
table(crabs2$sex, Pred)
## Pred
## F M
## F 96 4
## M 2 98

# Alternative B (any number of classes): expected loss of each decision is
# the probability matrix times the cost matrix; choose the minimal column.
Probs <- cbind(ProbF, ProbM)
Losses <- Probs %*% matrix(c(0, 3, 2, 0), nrow = 2)
bestI <- apply(Losses, MARGIN = 1, FUN = which.min)
Pred <- levels(crabs2$sex)[bestI]
table(crabs2$sex, Pred)
## Pred
## F M
## F 96 4
## M 2 98
Compute decision tree models with sex as target and all crab measurements as features in data set crabs1: one with default settings and another with 40 as the smallest allowed node size. Compute an aggregated prediction from these two trees and then the confusion matrix.
# Two classification trees for sex: default settings, and minimum node
# size 40; classify by the maximum of the averaged class probabilities.
library(tree)
crabs2 <- crabs1 %>% select(sex, FL:BD)
tree0 <- tree(as.factor(sex) ~ ., data = crabs2)
tree2 <- tree(as.factor(sex) ~ ., data = crabs2,
              control = tree.control(nrow(crabs2), minsize = 40))

# Aggregated prediction: mean of the two per-class probability matrices.
Probs <- (predict(tree0, newdata = crabs2) + predict(tree2, newdata = crabs2)) / 2
bestI <- apply(Probs, MARGIN = 1, FUN = which.max)
Pred <- levels(crabs2$sex)[bestI]
table(crabs2$sex, Pred)
## Pred
## F M
## F 96 4
## M 5 95
# Fit the full tree and classify with a custom threshold on the odds
# P(M) / P(F) > 0.7; then compute TPR, FPR, precision and recall,
# treating "F" (row/column 1 of the table) as the positive class.
crabs2 <- crabs1 %>% select(sex, FL:BD)
fullT <- tree(as.factor(sex) ~ ., data = crabs2)
Probs <- predict(fullT)
Decision <- ifelse(Probs[, 2] / Probs[, 1] > 0.7, "M", "F")
tab <- table(crabs2$sex, Decision)

# Confusion-matrix cells ("F" = positive class).
TP <- tab[1, 1]
TN <- tab[2, 2]
FP <- tab[2, 1]
FN <- tab[1, 2]

TPR <- TP / (TP + FN)  # sensitivity; equals recall by definition
FPR <- FP / (FP + TN)
prec <- TP / (TP + FP)
rec <- TP / (TP + FN)
cat(TPR, FPR, prec, rec)
## 0.96 0.05 0.950495 0.96
Use the tree fullT and find out the number of leaves, how many variables are used by the tree, and the proportion of the labels in the root node.
# Root-node label proportions: first printed row below; the numbers in
# parentheses ( 0.50000 0.50000 ) are the proportions of F and M.
print(fullT)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 200 277.300 F ( 0.50000 0.50000 )
## 2) RW < 15.9 176 241.200 M ( 0.43750 0.56250 )
## 4) CL < 35.9 136 187.100 F ( 0.55147 0.44853 )
## 8) RW < 13.1 101 135.600 M ( 0.39604 0.60396 )
## 16) CL < 30.15 79 109.500 F ( 0.50633 0.49367 )
## 32) RW < 11.35 59 76.820 M ( 0.35593 0.64407 )
## 64) CL < 25.9 37 50.620 F ( 0.56757 0.43243 )
## 128) RW < 10.55 31 42.940 M ( 0.48387 0.51613 )
## 256) CL < 22.9 23 30.790 F ( 0.60870 0.39130 )
## 512) RW < 9.1 15 20.190 M ( 0.40000 0.60000 )
## 1024) RW < 8.05 7 5.742 M ( 0.14286 0.85714 ) *
## 1025) RW > 8.05 8 10.590 F ( 0.62500 0.37500 ) *
## 513) RW > 9.1 8 0.000 F ( 1.00000 0.00000 ) *
## 257) CL > 22.9 8 6.028 M ( 0.12500 0.87500 ) *
## 129) RW > 10.55 6 0.000 F ( 1.00000 0.00000 ) *
## 65) CL > 25.9 22 0.000 M ( 0.00000 1.00000 ) *
## 33) RW > 11.35 20 7.941 F ( 0.95000 0.05000 )
## 66) CL < 29 15 0.000 F ( 1.00000 0.00000 ) *
## 67) CL > 29 5 5.004 F ( 0.80000 0.20000 ) *
## 17) CL > 30.15 22 0.000 M ( 0.00000 1.00000 ) *
## 9) RW > 13.1 35 0.000 F ( 1.00000 0.00000 ) *
## 5) CL > 35.9 40 15.880 M ( 0.05000 0.95000 )
## 10) CL < 37.1 11 10.430 M ( 0.18182 0.81818 )
## 20) FL < 17.85 6 0.000 M ( 0.00000 1.00000 ) *
## 21) FL > 17.85 5 6.730 M ( 0.40000 0.60000 ) *
## 11) CL > 37.1 29 0.000 M ( 0.00000 1.00000 ) *
## 3) RW > 15.9 24 8.314 F ( 0.95833 0.04167 )
## 6) FL < 21.55 19 0.000 F ( 1.00000 0.00000 ) *
## 7) FL > 21.55 5 5.004 F ( 0.80000 0.20000 ) *
# The variables used ("Variables actually used in tree construction") and
# the number of leaves ("Number of terminal nodes") are reported here.
summary(fullT)
##
## Classification tree:
## tree(formula = as.factor(sex) ~ ., data = crabs2)
## Variables actually used in tree construction:
## [1] "RW" "CL" "FL"
## Number of terminal nodes: 15
## Residual mean deviance: 0.2113 = 39.09 / 185
## Misclassification error rate: 0.045 = 9 / 200
Use cross-validation to find the optimal number of leaves in fullT, prune the tree, and plot the result.
# Cross-validate the full tree (deviance by default), pick the tree size
# (number of leaves) with the smallest CV deviance, prune to it and plot.
result <- cv.tree(fullT)
leaves <- result$size[which.min(result$dev)]
tree1 <- prune.tree(fullT, best = leaves)
plot(tree1)
text(tree1)
Use the crab measurements from crabs1 to run PCA. Plot the data in the first 2 PC coordinates, answer which original variable has the greatest contribution to PC1, provide the equation of PC1 in the original coordinates, and compress the data by using the first two PCs.
# PCA on the five crab measurements (princomp centres the data and uses
# the covariance matrix; all variables share the same unit, mm).
crabs2 <- crabs1 %>% select(FL:BD)
res2 <- princomp(crabs2)

# Data in the coordinates of the first two principal components.
plot(res2$scores[, 1], res2$scores[, 2])

# Variable with the greatest (absolute) contribution to PC1.
# Loadings have VARIABLES in rows and components in columns, so index the
# row names; the original colnames(...) indexing wrongly returned "Comp.4"
# (a component name) instead of the variable name.
rownames(res2$loadings)[which.max(abs(res2$loadings[, 1]))]
## [1] "CW"

res2$loadings[, 1]
## FL RW CL CW BD
## 0.2889810 0.1972824 0.5993986 0.6616550 0.2837317
# PC1 = 0.29*FL + 0.2*RW + 0.6*CL + 0.66*CW + 0.28*BD

# Compression: reconstruct from the first two PCs and add the column
# means back, since princomp centres the data before projecting.
compressed <- res2$scores[, 1:2] %*% t(res2$loadings[, 1:2]) +
  matrix(colMeans(crabs2), nrow = nrow(crabs2), ncol = ncol(crabs2), byrow = TRUE)
Use the crab measurements from crabs1 to run a Lasso regression with target \(CW\), all other crab measurements as features, and \(\lambda=1\). Compute the number of features selected and report the predictive equation.
# Lasso regression (alpha = 1) of CW on the remaining measurements with
# the penalty fixed at lambda = 1; zero coefficients are dropped features.
crabs2 <- crabs1 %>% select(FL:BD)
library(glmnet)
x <- as.matrix(crabs2 %>% select(-CW))
y <- as.matrix(crabs2 %>% select(CW))
mB <- glmnet(x, y, family = "gaussian", lambda = 1, alpha = 1)
coef(mB)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 5.6110801
## FL .
## RW .
## CL 0.9594437
## BD .
1 feature selected and predictive equation \(CW=5.61+0.96CL\)
Use the crab measurements from crabs1 to run a cross-validated Lasso regression with target \(CW\) and all other crab measurements as features. Report the optimal penalty.
# Cross-validated Lasso: cv.glmnet picks the penalty grid automatically;
# lambda.min is the penalty with the smallest mean CV error.
crabs2 <- crabs1 %>% select(FL:BD)
library(glmnet)
x <- as.matrix(crabs2 %>% select(-CW))
y <- as.matrix(crabs2 %>% select(CW))
model <- cv.glmnet(x, y, alpha = 1, family = "gaussian")
model$lambda.min
## [1] 0.05140542
Assuming \(MSE_{train}=(w-1)^2+(w-2)^2\) and \(MSE_{test}=(w-4)^2+(w+1)^2\), use BFGS optimization to plot the training and test MSE dependencies on iteration number. Is early stopping needed?
# Record training and test MSE at every objective evaluation during BFGS.
# optim() only sees the training error; the test error is stored as a side
# effect so both curves can be plotted against the evaluation number.
TestE <- list()
TrainE <- list()
k <- 0
mseTrain <- function(w) {
  train_err <- (w - 1)^2 + (w - 2)^2
  test_err <- (w - 4)^2 + (w + 1)^2
  # Write into the global environment: optim offers no tracing hook.
  .GlobalEnv$k <- .GlobalEnv$k + 1
  .GlobalEnv$TrainE[[k]] <- train_err
  .GlobalEnv$TestE[[k]] <- test_err
  train_err
}
res <- optim(c(0), fn = mseTrain, method = "BFGS")
plot(as.numeric(TrainE), type = "l", col = "blue", ylim = c(0, 60), ylab = "Error")
points(as.numeric(TestE), type = "l", col = "red")
Early stopping can be done at the 8th iteration to get the smallest test error.