General instructions

Try to solve the tasks first without looking at the answers; if you do not find a solution quickly, then check the answers.

Exercise 1

Extract the data set crabs from the library MASS, save it as crabs1, and read about these data in the help.

A possible solution
library(MASS)
crabs1=crabs
#?crabs

Exercise 2

Fit a logistic regression model with sex as the target and the RW–CW measurements as features in the data set crabs1; use the cost matrix (row 1 = F, row 2 = M)

\[\begin{bmatrix} 0 & 2\\ 3 & 0 \end{bmatrix}\]

and report confusion matrix for the classification of the training data.

A possible solution
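With the rows of the cost matrix read as the true class and the columns as the predicted class, the expected losses of the two possible decisions are

\[E[\text{loss}\mid \text{predict F}]=3\,p(M\mid x), \qquad E[\text{loss}\mid \text{predict M}]=2\,p(F\mid x),\]

so "F" should be predicted whenever \(p(F\mid x)/p(M\mid x)>3/2\). This is the threshold used in alternative A below, while alternative B compares the two expected losses directly.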
library(MASS)
library(dplyr)
crabs2=crabs1%>%select(sex, RW:CW)

levels(crabs1$sex)
## [1] "F" "M"
m1=glm(sex~., data=crabs2, family="binomial") #if more than 2 classes use multinom from nnet
ProbM=predict(m1, type="response") #type="response" gives P(sex="M"), the second factor level
ProbF=1-ProbM

#alternative A: good for 2 classes

Pred=ifelse(ProbF/ProbM>3/2, "F", "M")
table(crabs2$sex, Pred)
##    Pred
##      F  M
##   F 96  4
##   M  2 98
#alternative B: works for 2 or more classes
Probs=cbind(ProbF, ProbM)
Losses=Probs%*%matrix(c(0,3,2,0), nrow=2)
bestI=apply(Losses, MARGIN=1, FUN = which.min)
Pred=levels(crabs2$sex)[bestI]
table(crabs2$sex, Pred)
##    Pred
##      F  M
##   F 96  4
##   M  2 98
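As a small optional check (not part of the exercise), the total training cost implied by this confusion matrix can be obtained by weighting the counts with the cost matrix:

tab=table(crabs2$sex, Pred) #rows: true class, columns: predicted class
sum(tab*matrix(c(0,3,2,0), nrow=2)) #4 errors costing 2 plus 2 errors costing 3 = 14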

Exercise 3

Compute two decision tree models with sex as target and all crab measurements as features in data set crabs1: one with default settings and another with 40 as the smallest allowed node size. Compute an aggregated prediction from these two trees and then the confusion matrix.

A possible solution
library(tree)
crabs2=crabs1%>%select(sex, FL:BD)
tree0=tree(as.factor(sex)~., data=crabs2)
tree2=tree(as.factor(sex)~., data=crabs2, control = tree.control(nrow(crabs2), minsize = 40))

Probs=(predict(tree0, newdata=crabs2)+predict(tree2, newdata=crabs2))/2
bestI=apply(Probs, MARGIN=1, FUN = which.max)
Pred=levels(crabs2$sex)[bestI]
table(crabs2$sex, Pred)
##    Pred
##      F  M
##   F 96  4
##   M  5 95
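Here the aggregation averages the two trees' class-probability matrices and predicts the class with the larger averaged probability, i.e. a simple soft-voting ensemble of the two trees.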

Exercise 4

Compute a decision tree model fullT with sex as target and all crab measurements as features in data set crabs1, and for the decision threshold \(p(male)/p(female)=0.7\) compute TPR, FPR, precision, recall. Assume "F" is the positive class.
A possible solution
crabs2=crabs1%>%select(sex, FL:BD)

fullT=tree(as.factor(sex)~., data=crabs2)
Probs=predict(fullT)
Decision=ifelse(Probs[,2]/Probs[,1]>0.7, "M", "F")
tab=table(crabs2$sex, Decision)
TP=tab[1,1]
TN=tab[2,2]
FP=tab[2,1]
FN=tab[1,2]
TPR=TP/(TP+FN)
FPR=FP/(FP+TN)
prec=TP/(TP+FP)
rec=TP/(TP+FN)
cat(TPR,FPR, prec, rec)
## 0.96 0.05 0.950495 0.96
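Reading off the counts behind these numbers (there are 100 crabs of each sex in the training data, so the unprinted table has TP = 96, FN = 4, FP = 5, TN = 95) gives

\[TPR=\frac{96}{96+4}=0.96,\qquad FPR=\frac{5}{5+95}=0.05,\qquad \text{precision}=\frac{96}{96+5}\approx 0.95,\qquad \text{recall}=TPR=0.96.\]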

Exercise 5

Use the tree fullT and find out the number of leaves, how many variables are used by the tree, and the proportion of the labels in the root node.

A possible solution
#for proportions, look in the first row, within brackets
print(fullT)
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 200 277.300 F ( 0.50000 0.50000 )  
##      2) RW < 15.9 176 241.200 M ( 0.43750 0.56250 )  
##        4) CL < 35.9 136 187.100 F ( 0.55147 0.44853 )  
##          8) RW < 13.1 101 135.600 M ( 0.39604 0.60396 )  
##           16) CL < 30.15 79 109.500 F ( 0.50633 0.49367 )  
##             32) RW < 11.35 59  76.820 M ( 0.35593 0.64407 )  
##               64) CL < 25.9 37  50.620 F ( 0.56757 0.43243 )  
##                128) RW < 10.55 31  42.940 M ( 0.48387 0.51613 )  
##                  256) CL < 22.9 23  30.790 F ( 0.60870 0.39130 )  
##                    512) RW < 9.1 15  20.190 M ( 0.40000 0.60000 )  
##                     1024) RW < 8.05 7   5.742 M ( 0.14286 0.85714 ) *
##                     1025) RW > 8.05 8  10.590 F ( 0.62500 0.37500 ) *
##                    513) RW > 9.1 8   0.000 F ( 1.00000 0.00000 ) *
##                  257) CL > 22.9 8   6.028 M ( 0.12500 0.87500 ) *
##                129) RW > 10.55 6   0.000 F ( 1.00000 0.00000 ) *
##               65) CL > 25.9 22   0.000 M ( 0.00000 1.00000 ) *
##             33) RW > 11.35 20   7.941 F ( 0.95000 0.05000 )  
##               66) CL < 29 15   0.000 F ( 1.00000 0.00000 ) *
##               67) CL > 29 5   5.004 F ( 0.80000 0.20000 ) *
##           17) CL > 30.15 22   0.000 M ( 0.00000 1.00000 ) *
##          9) RW > 13.1 35   0.000 F ( 1.00000 0.00000 ) *
##        5) CL > 35.9 40  15.880 M ( 0.05000 0.95000 )  
##         10) CL < 37.1 11  10.430 M ( 0.18182 0.81818 )  
##           20) FL < 17.85 6   0.000 M ( 0.00000 1.00000 ) *
##           21) FL > 17.85 5   6.730 M ( 0.40000 0.60000 ) *
##         11) CL > 37.1 29   0.000 M ( 0.00000 1.00000 ) *
##      3) RW > 15.9 24   8.314 F ( 0.95833 0.04167 )  
##        6) FL < 21.55 19   0.000 F ( 1.00000 0.00000 ) *
##        7) FL > 21.55 5   5.004 F ( 0.80000 0.20000 ) *
#for variables used and number of leaves (terminal nodes), look here
summary(fullT)
## 
## Classification tree:
## tree(formula = as.factor(sex) ~ ., data = crabs2)
## Variables actually used in tree construction:
## [1] "RW" "CL" "FL"
## Number of terminal nodes:  15 
## Residual mean deviance:  0.2113 = 39.09 / 185 
## Misclassification error rate: 0.045 = 9 / 200
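So fullT has 15 leaves (terminal nodes), uses three of the variables (RW, CL and FL), and the root node contains the labels in proportion 0.5 F and 0.5 M.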

Exercise 6

Use cross-validation to find the optimal number of leaves in fullT, prune the tree, and plot the result.

A possible solution
result=cv.tree(fullT)
leaves=result$size[which.min(result$dev)]
tree1=prune.tree(fullT, best=leaves)
plot(tree1)
text(tree1)
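To see which tree sizes were tried and how their cross-validated deviances compare (an optional check, not required by the exercise):

#inspect the cross-validated deviance for each candidate number of leaves
plot(result$size, result$dev, type="b", xlab="Number of leaves", ylab="CV deviance")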


Exercise 7

Use the crab measurements from crabs1 to run PCA. Plot the data in the first two PC coordinates, answer which original variable has the greatest contribution to PC1, provide the equation of PC1 in the original coordinates, and compress the data by using the first two PCs.

A possible solution
crabs2=crabs1%>%select(FL:BD)
res2=princomp(crabs2)

plot(res2$scores[,1], res2$scores[,2])

rownames(res2$loadings)[which.max(res2$loadings[,1])] #variable with the largest loading on PC1
## [1] "CW"
res2$loadings[,1]
##        FL        RW        CL        CW        BD 
## 0.2889810 0.1972824 0.5993986 0.6616550 0.2837317
#PC1=0.29FL+0.2RW+0.6CL+0.66CW+0.28BD


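The compression below reconstructs each observation from its first two scores, \(\hat{X}=Z_{1:2}W_{1:2}^{\top}+\bar{x}\), where \(Z\) holds the scores, \(W\) the loadings, and \(\bar{x}\) the column means that princomp subtracted before the decomposition.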
compressed=res2$scores[,1:2]%*%t(res2$loadings[,1:2])+ 
  matrix(colMeans(crabs2), nrow=nrow(crabs2), ncol=ncol(crabs2), byrow=T)
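A quick way to check how much of the data the two-component compression retains (an optional sketch, not asked for in the exercise):

mean((as.matrix(crabs2)-compressed)^2) #mean squared reconstruction error
sum(res2$sdev[1:2]^2)/sum(res2$sdev^2) #share of total variance kept by PC1 and PC2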

Exercise 8

Use the crab measurements from crabs1 to run Lasso regression with target \(CW\), all other crab measurements as features, and \(\lambda=1\). Compute the number of features selected and report the predictive equation.

A possible solution
crabs2=crabs1%>%select(FL:BD)

library(glmnet)
x=as.matrix(crabs2%>%select(-CW))
y=as.matrix(crabs2%>%select(CW))
mB=glmnet(x, y,family="gaussian", lambda = 1, alpha=1)
coef(mB)
## 5 x 1 sparse Matrix of class "dgCMatrix"
##                    s0
## (Intercept) 5.6110801
## FL          .        
## RW          .        
## CL          0.9594437
## BD          .

One feature (CL) is selected, and the predictive equation is \(CW=5.61+0.96\,CL\).
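The number of selected features can also be counted programmatically (a small optional check):

b=as.matrix(coef(mB)) #convert the sparse coefficient matrix to an ordinary matrix
sum(b[-1,]!=0) #non-zero coefficients excluding the intercept, here 1 (CL)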


Exercise 9

Use the crab measurements from crabs1 to run cross-validated Lasso regression with target \(CW\) and all other crab measurements as features. Report the optimal penalty.

A possible solution
crabs2=crabs1%>%select(FL:BD)

library(glmnet)
x=as.matrix(crabs2%>%select(-CW))
y=as.matrix(crabs2%>%select(CW))
model=cv.glmnet(x, y, alpha=1,family="gaussian")
model$lambda.min
## [1] 0.05140542
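Note that cv.glmnet() picks the cross-validation folds at random, so the reported lambda.min varies slightly between runs unless a seed is fixed with set.seed() beforehand.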

Exercise 10

Assuming \(MSE_{train}=(w-1)^2+(w-2)^2\) and \(MSE_{test}=(w-4)^2+(w+1)^2\), use BFGS optimization to plot the training and test MSE dependencies on iteration number. Is early stopping needed?

A possible solution
TestE=list()
TrainE=list()
k=0

mseTrain= function(w){
  MSE_train=(w-1)^2+(w-2)^2
  MSE_test=(w-4)^2+(w+1)^2
  #record both errors in the global environment at every function evaluation
  .GlobalEnv$k= .GlobalEnv$k+1
  .GlobalEnv$TrainE[[k]]=MSE_train
  .GlobalEnv$TestE[[k]]=MSE_test
  return(MSE_train)
}

res=optim(c(0), fn=mseTrain,  method="BFGS")


plot(as.numeric(TrainE), type="l", col="blue", ylim=c(0,60), ylab="Error")
points(as.numeric(TestE), type="l", col="red")
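Each element of TrainE and TestE corresponds to one call of mseTrain (optim also calls it when approximating gradients numerically). The evaluation with the smallest test error can be located directly (a small optional check):

which.min(as.numeric(TestE)) #index of the evaluation with the lowest test MSE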

Early stopping can be done at the 8th iteration to get the smallest test error.