Try to solve the tasks first without looking at the answers, and if not finding a solution quickly then check the answers.
Create data frame birth by reading birthstatistics.csv into R and by using read.csv. Note that these data has headers.
birth = read.csv("birthstatistics.csv")
Create data frame blog by reading blogData_test.csv into R and by using read.csv. Note that headers (column names) are missing.
blog = read.csv("blogData_test.csv", header=FALSE)
Create data frame tecator by reading tecator.xls into R by using readxl package.
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
tecator = read_excel("tecator.xls")
Save tecator to tecator.csv with write.csv and make sure that row names are not saved
write.csv(tecator, file="tecator.csv", row.names = FALSE)
Convert tecator to a data frame and call it tecator1
tecator1 = as.data.frame(tecator)
Change row names in tecator1 to the values of Sample column plus 10
rownames(tecator1) = tecator1$Sample+10
Change column name in tecator1 from Sample to ID
colnames(tecator1)[1]="ID"
Extract rows in tecator1 such that Channel1 > 3 and Channel2 > 3 and columns between number 5 and number 8
print(tecator1[tecator1$Channel1>3 & tecator1$Channel2>3, 5:8])
## Channel4 Channel5 Channel6 Channel7
## 16 3.02634 3.03190 3.03756 3.04341
## 19 3.29300 3.29956 3.30627 3.31310
## 20 3.42001 3.42735 3.43479 3.44245
## 21 3.02469 3.03022 3.03601 3.04208
## 25 3.15899 3.16774 3.17642 3.18504
## 34 3.34985 3.35772 3.36558 3.37354
## 43 3.65758 3.66153 3.66578 3.67033
## 45 3.44046 3.44530 3.45056 3.45637
## 48 3.50489 3.51172 3.51873 3.52590
## 49 3.34351 3.35041 3.35752 3.36478
## 50 3.11264 3.11762 3.12276 3.12814
## 51 3.52075 3.52962 3.53865 3.54785
## 53 3.95693 3.96451 3.97251 3.98100
## 54 4.26773 4.27847 4.28968 4.30133
## 61 3.15899 3.16774 3.17642 3.18504
## 87 3.29528 3.30272 3.31020 3.31766
## 89 3.47360 3.48156 3.48952 3.49751
## 90 3.17813 3.18381 3.18961 3.19554
## 91 3.22318 3.22838 3.23368 3.23907
## 94 3.20930 3.21274 3.21636 3.22018
## 97 3.37403 3.38189 3.38971 3.39757
## 102 3.08597 3.09050 3.09514 3.09988
## 109 4.10717 4.11636 4.12572 4.13525
## 112 3.08688 3.09123 3.09573 3.10044
## 117 3.15323 3.15981 3.16644 3.17315
## 127 3.50489 3.51172 3.51873 3.52590
## 128 3.40334 3.40911 3.41516 3.42160
## 129 3.57747 3.58383 3.59043 3.59735
## 132 3.48186 3.48627 3.49097 3.49603
## 133 3.11264 3.11762 3.12276 3.12814
## 135 3.64674 3.65480 3.66311 3.67172
## 140 3.01383 3.01746 3.02126 3.02523
## 142 3.04335 3.04876 3.05436 3.06027
## 149 3.88595 3.89309 3.90047 3.90805
## 150 3.67866 3.68262 3.68689 3.69170
## 151 3.26786 3.27439 3.28116 3.28824
## 152 3.34351 3.35041 3.35752 3.36478
## 154 3.25922 3.26856 3.27784 3.28712
## 165 3.60163 3.61153 3.62135 3.63113
## 166 3.26700 3.27328 3.27958 3.28588
## 170 3.17258 3.17850 3.18450 3.19064
## 175 3.06896 3.07245 3.07616 3.08014
## 176 3.30482 3.31024 3.31584 3.32175
## 178 3.16061 3.16566 3.17092 3.17644
## 181 3.24026 3.24399 3.24807 3.25256
## 194 3.11395 3.11808 3.12238 3.12691
## 195 4.24588 4.25643 4.26727 4.27837
## 197 3.67057 3.68146 3.69229 3.70307
## 207 3.09688 3.09937 3.10200 3.10473
## 208 3.41130 3.41858 3.42588 3.43325
## 210 3.48244 3.49209 3.50170 3.51128
## 213 3.36992 3.37733 3.38473 3.39223
## 214 3.88595 3.89309 3.90047 3.90805
## 217 3.32423 3.33257 3.34094 3.34939
## 221 3.16244 3.16791 3.17348 3.17923
Remove column ID in tecator1
tecator1$ID=c()
Update tecator1 by dividing its all Channel columns with their respective means per column
library(stringr)
index=str_which(colnames(tecator1), "Channel")
tecatorChannel=tecator1[,index]
means=colMeans(tecatorChannel)
tecator1[,index]=tecator1[,index]/matrix(means, nrow=nrow(tecatorChannel), ncol=ncol(tecatorChannel), byrow=TRUE)
Compute a sum of squares for each row between 1 and 5 in tecator1 without writing loops and make it as a matrix with one column
sumsq=apply(tecator1[1:5,], MARGIN = 1, FUN=function(x) return(sum(x^2)) )
tecator2=matrix(sumsq, ncol=1)
Extract \(X\) as all columns except of columns 101-103 in tecator1, \(y\) as column Fat and compute \((X^T X)^{-1}X^T y\)
X=as.matrix(tecator1[,-c(101, 102, 103)]) #can be written more efficiently as -(101:103)
y=as.matrix(tecator1[,"Fat", drop=F]) #keep it as a matrix, don't reduce dimension.
result=solve(t(X)%*%X, t(X)%*%y)
Use column Channel1 in tecator1 to compute new column ChannelX which is a factor with the following levels: “high” if \(Channel1 > 1\) and “low” otherwise
tecator1$ChannelX=as.factor(ifelse(tecator1$Channel1>1, "high", "low"))
Write a for loop that computes regressions \(Fat\) as function of \(Channel_i, i=1,...100\) and then stores the intercepts into vector Intercepts. Print Intercepts.
Intercepts=numeric(100)
for (i in 1:length(Intercepts)){
regr=lm(formula=paste("Fat~Channel", i, sep=""), data=tecator1)
Intercepts[i]=coef(regr)[1]
}
print(Intercepts)
## [1] -14.01773 -13.76397 -13.52270 -13.29390 -13.08034 -12.88563 -12.71359
## [8] -12.56641 -12.44356 -12.34570 -12.27168 -12.22515 -12.21155 -12.23623
## [15] -12.30173 -12.40675 -12.54879 -12.72990 -12.94879 -13.19545 -13.45277
## [22] -13.69431 -13.89737 -14.05411 -14.17536 -14.28779 -14.43476 -14.65566
## [29] -14.97068 -15.37844 -15.85124 -16.34390 -16.81430 -17.24329 -17.64027
## [36] -18.03210 -18.43969 -18.84519 -19.20978 -19.48554 -19.62388 -19.58094
## [43] -19.32884 -18.86872 -18.24092 -17.52545 -16.82373 -16.24793 -15.85195
## [50] -15.65490 -15.64734 -15.79776 -16.05945 -16.37694 -16.69329 -16.96489
## [57] -17.16854 -17.30834 -17.39111 -17.44049 -17.47078 -17.49234 -17.50794
## [64] -17.51945 -17.52439 -17.52377 -17.51404 -17.50338 -17.49357 -17.49041
## [71] -17.49374 -17.50372 -17.52150 -17.54580 -17.57384 -17.60502 -17.63857
## [78] -17.68119 -17.73488 -17.80861 -17.89303 -17.97989 -18.05824 -18.11635
## [85] -18.14382 -18.13003 -18.08049 -18.01142 -17.94324 -17.89087 -17.85820
## [92] -17.83982 -17.82226 -17.79516 -17.75035 -17.68339 -17.58624 -17.45033
## [99] -17.27408 -17.06644
Given equation \(y=5x+1\), plot this dependence for x between 1 and 3
x=c(1,3)
y=5*x+1
plot(x,y, type="l", col="blue")
Convert data set birth to a tibble birth1
library(dplyr)
library(tidyr)
birth1=tibble(birth)
Select only columns X2002-X2020 from birth1 and save into birth2
birth2=birth1%>%select(X2002:X2020)
Create a new variable Status in birth1 that is equal to “Yes” if the record says “born in Sweden with two parents born in Sweden” and “No” otherwise
birth1=birth1%>%
mutate(Status=ifelse(foreign.Swedish.background=="born in Sweden with two parents born in Sweden",
"Yes", "No"))
Count the amount of rows in birth 1 corresponding to various combinations of sex and region
birth1%>%count(sex,region)
## # A tibble: 42 x 3
## sex region n
## <chr> <chr> <int>
## 1 boys 01 Stockholm county 4
## 2 boys 03 Uppsala county 4
## 3 boys 04 Södermanland county 4
## 4 boys 05 Östergötland county 4
## 5 boys 06 Jönköping county 4
## 6 boys 07 Kronoberg county 4
## 7 boys 08 Kalmar county 4
## 8 boys 09 Gotland county 4
## 9 boys 10 Blekinge county 4
## 10 boys 12 Skåne county 4
## # ... with 32 more rows
Assuming that X2002-X2020 in birth1 show amounts of persons born respective year, compute total amount of people born these years irrespective gender, given Status and region. Save the result into birth3
birth3=birth1%>%
select(-sex,- foreign.Swedish.background)%>%
group_by(region, Status)%>%
summarise_all(sum)%>%
ungroup()
birth3
## # A tibble: 42 x 21
## region Status X2002 X2003 X2004 X2005 X2006 X2007 X2008 X2009 X2010
## <chr> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 01 Sto~ No 145978 147738 149411 151881 155983 160979 165751 171277 176658
## 2 01 Sto~ Yes 257239 259896 262351 264250 266025 266956 266740 267620 268601
## 3 03 Upp~ No 16505 16572 16654 16610 17236 17574 18053 18471 18792
## 4 03 Upp~ Yes 50785 50646 50684 50459 52617 52106 51607 51003 50522
## 5 04 Söd~ No 14107 14320 14311 14340 14443 14760 15414 15985 16248
## 6 04 Söd~ Yes 43185 43058 42744 42354 41920 41404 40628 39906 39377
## 7 05 Öst~ No 18068 18427 18662 18861 19453 20222 21103 21956 22499
## 8 05 Öst~ Yes 71677 71159 70484 69709 68717 67725 66286 65177 64130
## 9 06 Jön~ No 16451 16562 16611 16839 17271 17861 18308 18524 18687
## 10 06 Jön~ Yes 58973 58513 58020 57318 56399 55572 54255 53061 52074
## # ... with 32 more rows, and 10 more variables: X2011 <int>, X2012 <int>,
## # X2013 <int>, X2014 <int>, X2015 <int>, X2016 <int>, X2017 <int>,
## # X2018 <int>, X2019 <int>, X2020 <int>
By using birth3, compute percentage of people in 2002 having Status=Yes in different counties. Report a table with column region and Percentage sorted by Percentage.
birth4=birth3%>%
group_by(region)%>%
mutate(Percentage=X2002/sum(X2002)*100)%>%
filter(Status=="Yes")%>%
select(region, Percentage)%>%
ungroup()%>%
arrange(Percentage)
birth4
## # A tibble: 21 x 2
## region Percentage
## <chr> <dbl>
## 1 01 Stockholm county 63.8
## 2 12 Skåne county 70.9
## 3 19 Västmanland county 74.7
## 4 14 Västra Götaland county 74.9
## 5 04 Södermanland county 75.4
## 6 03 Uppsala county 75.5
## 7 18 Örebro county 77.8
## 8 06 Jönköping county 78.2
## 9 07 Kronoberg county 79.9
## 10 05 Östergötland county 79.9
## # ... with 11 more rows
By using birth1, transform the table to a long format: make sure that years are shown in column Year and values from the respective X2002-X2020 are stored in column Born. Make sure also that Year values show years as numerical values and store the table as birth5.
birth5 = birth1%>%
group_by(region, sex, foreign.Swedish.background, Status)%>%
pivot_longer(X2002:X2020, names_to="Year", values_to = "Born")%>%
mutate(Year=as.numeric(stringr::str_remove(Year, "X")))
birth5
## # A tibble: 3,192 x 6
## # Groups: region, sex, foreign.Swedish.background, Status [168]
## region sex foreign.Swedish.background Status Year Born
## <chr> <chr> <chr> <chr> <dbl> <int>
## 1 01 Stockholm county boys foreign born No 2002 13346
## 2 01 Stockholm county boys foreign born No 2003 13157
## 3 01 Stockholm county boys foreign born No 2004 12759
## 4 01 Stockholm county boys foreign born No 2005 12675
## 5 01 Stockholm county boys foreign born No 2006 13271
## 6 01 Stockholm county boys foreign born No 2007 14325
## 7 01 Stockholm county boys foreign born No 2008 15126
## 8 01 Stockholm county boys foreign born No 2009 16074
## 9 01 Stockholm county boys foreign born No 2010 16901
## 10 01 Stockholm county boys foreign born No 2011 18075
## # ... with 3,182 more rows
By using birth5, transform the table to wide format: make sure that years are shown as separate columns and their corresponding values are given by Born. Columns should be named as “Y_2002” for example.
birth6 = birth5%>%
group_by(region, sex, foreign.Swedish.background, Status)%>%
pivot_wider(names_from = Year, values_from = Born, names_prefix = "Y_")
birth6
## # A tibble: 168 x 23
## # Groups: region, sex, foreign.Swedish.background, Status [168]
## region sex foreign.Swedish~ Status Y_2002 Y_2003 Y_2004 Y_2005 Y_2006
## <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int>
## 1 01 Stockhol~ boys foreign born No 13346 13157 12759 12675 13271
## 2 01 Stockhol~ boys born in Sweden ~ No 31596 32185 32785 33456 34282
## 3 01 Stockhol~ boys born in Sweden ~ No 29815 30395 30976 31565 32102
## 4 01 Stockhol~ boys born in Sweden ~ Yes 131708 133236 134604 135502 136532
## 5 01 Stockhol~ girls foreign born No 12946 12732 12570 12570 13376
## 6 01 Stockhol~ girls born in Sweden ~ No 30157 30681 31328 31934 32687
## 7 01 Stockhol~ girls born in Sweden ~ No 28118 28588 28993 29681 30265
## 8 01 Stockhol~ girls born in Sweden ~ Yes 125531 126660 127747 128748 129493
## 9 03 Uppsala ~ boys foreign born No 1736 1694 1614 1565 1659
## 10 03 Uppsala ~ boys born in Sweden ~ No 2772 2834 2882 2883 2981
## # ... with 158 more rows, and 14 more variables: Y_2007 <int>, Y_2008 <int>,
## # Y_2009 <int>, Y_2010 <int>, Y_2011 <int>, Y_2012 <int>, Y_2013 <int>,
## # Y_2014 <int>, Y_2015 <int>, Y_2016 <int>, Y_2017 <int>, Y_2018 <int>,
## # Y_2019 <int>, Y_2020 <int>
By using blog data, filter out columns that have zeroes everywhere.
blogS=tibble(blog)%>%select_if(function(x) !all(x==0))
blogS
## # A tibble: 206 x 154
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2.27 4.98 0 44 1 1.03 3.00 0 37 0 0.859 2.98
## 2 3.72 7.94 0 43 0 1.37 4.51 0 41 0 1.31 4.48
## 3 3.72 7.94 0 43 0 1.37 4.51 0 41 0 1.31 4.48
## 4 123. 110. 0 1069 89 44.9 74.5 0 1046 12 42.8 74.7
## 5 43.4 75.6 0 634 20 16.0 44.6 0 473 2 15.5 44.7
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## 7 3.72 7.94 0 43 0 1.37 4.51 0 41 0 1.31 4.48
## 8 0 0 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0 0 0
## 10 43.4 75.6 0 634 20 16.0 44.6 0 473 2 15.5 44.7
## # ... with 196 more rows, and 142 more variables: V14 <dbl>, V15 <dbl>,
## # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
## # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
## # V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V34 <dbl>,
## # V35 <dbl>, V36 <dbl>, V37 <dbl>, V39 <dbl>, V41 <dbl>, V42 <dbl>,
## # V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## # V49 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>, V55 <dbl>, ...
Sample and print random 5 rows from birth data without replacement, use seed 123.
set.seed(123)
smp=birth[sample(nrow(birth), 5, replace=FALSE), ]
print(smp)
## region sex
## 159 24 Västerbotten county girls
## 14 03 Uppsala county girls
## 50 08 Kalmar county boys
## 118 19 Västmanland county girls
## 43 07 Kronoberg county boys
## foreign.Swedish.background
## 159 born in Sweden with one parent born in Sweden and one foreign born parent
## 14 born in Sweden with two foreign born parents
## 50 born in Sweden with two foreign born parents
## 118 born in Sweden with two foreign born parents
## 43 born in Sweden with one parent born in Sweden and one foreign born parent
## X2002 X2003 X2004 X2005 X2006 X2007 X2008 X2009 X2010 X2011 X2012 X2013
## 159 1740 1769 1808 1826 1880 1958 1979 2030 2061 2137 2177 2215
## 14 2615 2687 2740 2755 2807 2841 2894 2980 3066 3179 3313 3418
## 50 953 1033 1067 1126 1181 1233 1276 1333 1423 1471 1504 1518
## 118 2138 2196 2252 2321 2360 2431 2534 2621 2745 2862 3016 3206
## 43 1727 1755 1790 1795 1804 1817 1851 1825 1828 1820 1862 1894
## X2014 X2015 X2016 X2017 X2018 X2019 X2020
## 159 2265 2303 2410 2466 2525 2575 2627
## 14 3550 3830 4112 4498 4807 5234 5592
## 50 1603 1713 1900 2125 2314 2513 2661
## 118 3464 3684 3950 4254 4625 4973 5300
## 43 1938 1963 2043 2092 2154 2193 2258
Generate a vector with 5 elements from a Normal distribution with mean 5 and standard deviation 0.1
rnorm(5, mean=5, sd=0.1)
## [1] 4.989103 4.988276 5.018308 5.128055 4.827273
Generate a vector with 168 elements from a Normal distribution with means equal to X2002 values in birth data and standard deviations 0.1
rnorm(168, mean=birth$X2002, sd=0.1)
## [1] 13346.16902 31596.05038 29815.25283 131708.05491 12946.02382
## [6] 30156.89511 28118.12948 125531.08255 1735.99443 2771.92156
## [11] 3849.92665 26261.97841 1737.96651 2614.89143 3793.99146
## [16] 24523.10706 1609.98546 2241.88345 3359.91815 22176.06849
## [21] 1509.96799 2095.86885 3288.94004 21008.98706 2280.08867
## [26] 3319.98486 3683.03298 36963.67727 2183.92282 3096.02865
## [31] 3504.87795 34713.04346 2245.08002 3134.98361 2959.12429
## [36] 30280.90656 2075.03937 3002.04036 3034.91136 28691.86811
## [41] 1062.00288 1105.95679 1727.16899 15809.12284 997.02760
## [46] 1091.89510 1695.94791 14654.16232 1196.89299 953.16859
## [51] 1618.97583 21814.95318 1071.92270 947.21499 1576.86656
## [56] 20721.04959 153.12340 77.06344 376.04120 5891.07936
## [61] 150.98476 64.97711 368.90992 5634.92650 698.85723
## [66] 625.06193 1241.99938 13296.93143 658.97207 563.92173
## [71] 1197.92210 12545.96252 8893.96806 14296.00845 13320.92315
## [76] 89180.93741 8682.90991 13574.06637 12774.03003 84864.00749
## [81] 1601.02064 1701.95111 2993.93720 26886.99531 1464.01626
## [86] 1729.12923 2775.95364 25331.03055 9708.99160 15545.04104
## [91] 17416.01837 127121.17787 9159.00377 14616.11762 16513.94415
## [96] 120073.90544 1195.93348 825.04520 2371.05269 24900.97697
## [101] 1124.13974 795.17637 2327.04856 23620.97343 1713.01516
## [106] 2262.13766 2718.98196 23569.84323 1674.97393 2180.09618
## [111] 2554.08539 22414.04188 1630.03400 2200.05964 3480.18714
## [116] 21522.06029 1547.92338 2137.93797 3319.07902 20658.97580
## [121] 931.11175 871.11849 2396.16465 26506.01930 930.96074
## [126] 774.00071 2358.75052 24841.90227 1113.06286 990.99159
## [131] 2093.03821 25492.89066 968.92445 920.97360 1975.92475
## [136] 24126.04407 828.87225 574.11772 1713.09025 22643.87387
## [141] 779.08375 535.76517 1658.06110 21393.99521 375.76008
## [146] 129.99807 906.99113 12419.84045 319.08517 124.92864
## [151] 846.10664 11753.94638 1017.05359 717.81714 1893.81865
## [156] 24825.13726 980.94357 711.09703 1739.99814 23577.03624
## [161] 844.20113 797.88220 2843.92448 23143.96687 845.97163
## [166] 745.03142 2642.18448 21760.90181
Compute probability density values of standard normal distribution in points x=-1, 0, 1.
dnorm(c(-1,0,1))
## [1] 0.2419707 0.3989423 0.2419707
Assuming in birth data \(X2003=w\cdot X2002 +w_0+\epsilon\) where \(\epsilon \sim Exponential(1)\), write down a minus log-likelihood formula for this model as a function of parameters \(w\) and \(w_0\) and implement an R function for the minus log-likelihood
From the formula, we get \(X2003-w\cdot X2002 -w_0=\epsilon\); it means that for each observation \(X2003-w\cdot X2002 -w_0 \sim Exponential(1)\). The pdf of this exponential distribution is \(p(\epsilon)=e^{-\epsilon}\) so \(p(X2003|X2002,w,w_0)=exp^{-(X2003-w\cdot X2002-w_0)}\) and therefore log-probability is \(log p(X2003|X2002,w,w_0)=-(X2003-w\cdot X2002-w_0)\). The minus log-likelihood is the minus sum of log-probability over all observations.
loglik = function(w, w0){
Probs=-(birth$X2003-w*birth$X2002-w0)
return (-sum(Probs))
}
#testing
loglik(1,1)
## [1] 358
Compute a linear regression model with target X2020 and features X2002-X2004 by using data birth. Report the regression coefficients and the MSE and probabilistic model.
df=birth%>%select(X2020, X2002:X2004)
m1=lm(X2020~., df)
coef(m1)
## (Intercept) X2002 X2003 X2004
## 1872.513918 -1.338202 -10.354357 12.672786
Preds=predict(m1)
MSE=mean((df$X2020-Preds)^2)
MSE
## [1] 9568343
Probabilistic model \(X2020 \sim N(1872-1.33X2002-10.35X2003+12.67X2004, 9568343)\)
Use package fastDummy to create dummy variables for variables region and sex in birth data
library(fastDummies)
dummies = dummy_cols(birth, select_columns = c("region", "sex"))
Use rows 1-50 of birth data with columns X2002-X2020 as training data and rows 51-100 of the same data and test data. Scale training and test data appropriately with caret package
library(caret)
train=birth%>%select(X2002:X2020)%>%slice(1:50)
test=birth%>%select(X2002:X2020)%>%slice(51:100)
params=preProcess(train)
trainS=predict(params, train)
testS=predict(params,test)
Compute logistic regression by using birth data and sex as target and X2002-X2004 as features. Assuming probability threshold 0.5, compute and print the confusion matrix. Report probabilistic model.
train=birth%>%select(X2002:X2004, sex)
m1=glm(as.factor(sex)~., train, family = "binomial")
Prob=predict(m1, type="response")
Pred=ifelse(Prob>0.5, "girls", "boys")
table(train$sex, Pred)
## Pred
## boys girls
## boys 34 50
## girls 32 52
summary(m1)
##
## Call:
## glm(formula = as.factor(sex) ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.26906 -1.18242 0.05076 1.17339 1.24122
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0172008 0.1744414 0.099 0.921
## X2002 0.0009101 0.0026498 0.343 0.731
## X2003 -0.0018329 0.0050929 -0.360 0.719
## X2004 0.0009227 0.0024851 0.371 0.710
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 232.90 on 167 degrees of freedom
## Residual deviance: 232.72 on 164 degrees of freedom
## AIC: 240.72
##
## Number of Fisher Scoring iterations: 3
Probabilistic model \(P(sex=girls)=\frac{1}{1+exp^{-0.0172+0.0009X2002-0.0018X2003+ 0.0009X2004}}\)
Compute k nearest neighbor classification with k=10 by using birth data and sex as target and X2002-X2010 as features. Print the predicted class labels for the training data.
train=birth%>%select(X2002:X2010, sex)
library(kknn)
m1=kknn(as.factor(sex)~., train,train, k=10,kernel="rectangular")
Pred=m1$fitted.values
Pred
## [1] girls boys boys boys girls boys boys boys boys boys boys boys
## [13] boys boys boys boys girls girls girls girls boys girls boys boys
## [25] girls boys boys boys girls boys girls boys girls boys girls girls
## [37] boys boys girls boys girls boys boys girls girls boys girls boys
## [49] boys boys girls girls girls boys girls girls girls girls girls boys
## [61] girls girls girls boys boys girls boys girls boys boys boys girls
## [73] boys boys girls boys boys boys boys girls girls girls girls boys
## [85] girls boys boys boys boys girls girls boys girls boys girls boys
## [97] boys boys girls boys boys girls girls girls boys boys boys boys
## [109] boys girls girls girls girls girls boys girls girls girls boys girls
## [121] boys girls girls boys girls girls girls boys boys boys girls boys
## [133] boys boys boys boys boys boys boys girls girls boys girls boys
## [145] girls girls boys girls girls girls girls girls boys boys boys boys
## [157] girls boys boys girls girls girls girls girls girls girls boys girls
## Levels: boys girls
Compute the optimal parameter values and the optimal function value for \(\min_{x_1, x_2} (x_1-1)^2+(x_2-2)^2\).
df=data.frame(x1=1,x2=2)
to_optimize<-function(x){
x1=x[1]
x2=x[2]
return((x1-df$x1)^2+(x2-df$x2)^2)
}
res=optim(c(0,0), fn=to_optimize, method="BFGS")
res$par
## [1] 1 2
res$value
## [1] 6.357135e-26