Try to solve the tasks first without looking at the answers, and if not finding a solution quickly then check the answers.
Create data frame birth by reading birthstatistics.csv into R and by using read.csv. Note that these data has headers.
birth = read.csv("birthstatistics.csv")
Create data frame blog by reading blogData_test.csv into R and by using read.csv. Note that headers (column names) are missing.
blog = read.csv("blogData_test.csv", header=FALSE)
Create data frame tecator by reading tecator.xls into R by using readxl package.
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
tecator = read_excel("tecator.xls")
Save tecator to tecator.csv with write.csv and make sure that row names are not saved
write.csv(tecator, file="tecator.csv", row.names = FALSE)
Convert tecator to a data frame and call it tecator1
tecator1 = as.data.frame(tecator)
Change row names in tecator1 to the values of Sample column plus 10
rownames(tecator1) = tecator1$Sample+10
Change column name in tecator1 from Sample to ID
colnames(tecator1)[1]="ID"
Extract rows in tecator1 such that Channel1 > 3 and Channel2 > 3 and columns between number 5 and number 8
print(tecator1[tecator1$Channel1>3 & tecator1$Channel2>3, 5:8])
## Channel4 Channel5 Channel6 Channel7
## 16 3.02634 3.03190 3.03756 3.04341
## 19 3.29300 3.29956 3.30627 3.31310
## 20 3.42001 3.42735 3.43479 3.44245
## 21 3.02469 3.03022 3.03601 3.04208
## 25 3.15899 3.16774 3.17642 3.18504
## 34 3.34985 3.35772 3.36558 3.37354
## 43 3.65758 3.66153 3.66578 3.67033
## 45 3.44046 3.44530 3.45056 3.45637
## 48 3.50489 3.51172 3.51873 3.52590
## 49 3.34351 3.35041 3.35752 3.36478
## 50 3.11264 3.11762 3.12276 3.12814
## 51 3.52075 3.52962 3.53865 3.54785
## 53 3.95693 3.96451 3.97251 3.98100
## 54 4.26773 4.27847 4.28968 4.30133
## 61 3.15899 3.16774 3.17642 3.18504
## 87 3.29528 3.30272 3.31020 3.31766
## 89 3.47360 3.48156 3.48952 3.49751
## 90 3.17813 3.18381 3.18961 3.19554
## 91 3.22318 3.22838 3.23368 3.23907
## 94 3.20930 3.21274 3.21636 3.22018
## 97 3.37403 3.38189 3.38971 3.39757
## 102 3.08597 3.09050 3.09514 3.09988
## 109 4.10717 4.11636 4.12572 4.13525
## 112 3.08688 3.09123 3.09573 3.10044
## 117 3.15323 3.15981 3.16644 3.17315
## 127 3.50489 3.51172 3.51873 3.52590
## 128 3.40334 3.40911 3.41516 3.42160
## 129 3.57747 3.58383 3.59043 3.59735
## 132 3.48186 3.48627 3.49097 3.49603
## 133 3.11264 3.11762 3.12276 3.12814
## 135 3.64674 3.65480 3.66311 3.67172
## 140 3.01383 3.01746 3.02126 3.02523
## 142 3.04335 3.04876 3.05436 3.06027
## 149 3.88595 3.89309 3.90047 3.90805
## 150 3.67866 3.68262 3.68689 3.69170
## 151 3.26786 3.27439 3.28116 3.28824
## 152 3.34351 3.35041 3.35752 3.36478
## 154 3.25922 3.26856 3.27784 3.28712
## 165 3.60163 3.61153 3.62135 3.63113
## 166 3.26700 3.27328 3.27958 3.28588
## 170 3.17258 3.17850 3.18450 3.19064
## 175 3.06896 3.07245 3.07616 3.08014
## 176 3.30482 3.31024 3.31584 3.32175
## 178 3.16061 3.16566 3.17092 3.17644
## 181 3.24026 3.24399 3.24807 3.25256
## 194 3.11395 3.11808 3.12238 3.12691
## 195 4.24588 4.25643 4.26727 4.27837
## 197 3.67057 3.68146 3.69229 3.70307
## 207 3.09688 3.09937 3.10200 3.10473
## 208 3.41130 3.41858 3.42588 3.43325
## 210 3.48244 3.49209 3.50170 3.51128
## 213 3.36992 3.37733 3.38473 3.39223
## 214 3.88595 3.89309 3.90047 3.90805
## 217 3.32423 3.33257 3.34094 3.34939
## 221 3.16244 3.16791 3.17348 3.17923
Remove column ID in tecator1
tecator1$ID=c()
Update tecator1 by dividing its all Channel columns with their respective means per column
library(stringr)
index=str_which(colnames(tecator1), "Channel")
tecatorChannel=tecator1[,index]
means=colMeans(tecatorChannel)
tecator1[,index]=tecator1[,index]/matrix(means, nrow=nrow(tecatorChannel), ncol=ncol(tecatorChannel), byrow=TRUE)
Compute a sum of squares for each row between 1 and 5 in tecator1 without writing loops and make it as a matrix with one column
sumsq=apply(tecator1[1:5,], MARGIN = 1, FUN=function(x) return(sum(x^2)) )
tecator2=matrix(sumsq, ncol=1)
Extract \(X\) as all columns except of columns 101-103 in tecator1, \(y\) as column Fat and compute \((X^T X)^{-1}X^T y\)
X=as.matrix(tecator1[,-c(101, 102, 103)]) #can be written more efficiently as -(101:103)
y=as.matrix(tecator1[,"Fat", drop=F]) #keep it as a matrix, don't reduce dimension.
result=solve(t(X)%*%X, t(X)%*%y)
Use column Channel1 in tecator1 to compute new column ChannelX which is a factor with the following levels: “high” if \(Channel1 > 1\) and “low” otherwise
tecator1$ChannelX=as.factor(ifelse(tecator1$Channel1>1, "high", "low"))
Write a for loop that computes regressions \(Fat\) as function of \(Channel_i, i=1,...100\) and then stores the intercepts into vector Intercepts. Print Intercepts.
Intercepts=numeric(100)
for (i in 1:length(Intercepts)){
regr=lm(formula=paste("Fat~Channel", i, sep=""), data=tecator1)
Intercepts[i]=coef(regr)[1]
}
print(Intercepts)
## [1] -14.01773 -13.76397 -13.52270 -13.29390 -13.08034 -12.88563 -12.71359
## [8] -12.56641 -12.44356 -12.34570 -12.27168 -12.22515 -12.21155 -12.23623
## [15] -12.30173 -12.40675 -12.54879 -12.72990 -12.94879 -13.19545 -13.45277
## [22] -13.69431 -13.89737 -14.05411 -14.17536 -14.28779 -14.43476 -14.65566
## [29] -14.97068 -15.37844 -15.85124 -16.34390 -16.81430 -17.24329 -17.64027
## [36] -18.03210 -18.43969 -18.84519 -19.20978 -19.48554 -19.62388 -19.58094
## [43] -19.32884 -18.86872 -18.24092 -17.52545 -16.82373 -16.24793 -15.85195
## [50] -15.65490 -15.64734 -15.79776 -16.05945 -16.37694 -16.69329 -16.96489
## [57] -17.16854 -17.30834 -17.39111 -17.44049 -17.47078 -17.49234 -17.50794
## [64] -17.51945 -17.52439 -17.52377 -17.51404 -17.50338 -17.49357 -17.49041
## [71] -17.49374 -17.50372 -17.52150 -17.54580 -17.57384 -17.60502 -17.63857
## [78] -17.68119 -17.73488 -17.80861 -17.89303 -17.97989 -18.05824 -18.11635
## [85] -18.14382 -18.13003 -18.08049 -18.01142 -17.94324 -17.89087 -17.85820
## [92] -17.83982 -17.82226 -17.79516 -17.75035 -17.68339 -17.58624 -17.45033
## [99] -17.27408 -17.06644
Given equation \(y=5x+1\), plot this dependence for x between 1 and 3
x=c(1,3)
y=5*x+1
plot(x,y, type="l", col="blue")
Convert data set birth to a tibble birth1
library(dplyr)
library(tidyr)
birth1=tibble(birth)
Select only columns X2002-X2020 from birth1 and save into birth2
birth2=birth1%>%select(X2002:X2020)
Create a new variable Status in birth1 that is equal to “Yes” if the record says “born in Sweden with two parents born in Sweden” and “No” otherwise
birth1=birth1%>%
mutate(Status=ifelse(foreign.Swedish.background=="born in Sweden with two parents born in Sweden",
"Yes", "No"))
Count the amount of rows in birth 1 corresponding to various combinations of sex and region
birth1%>%count(sex,region)
## # A tibble: 42 x 3
## sex region n
## <chr> <chr> <int>
## 1 boys 01 Stockholm county 4
## 2 boys 03 Uppsala county 4
## 3 boys 04 Södermanland county 4
## 4 boys 05 Östergötland county 4
## 5 boys 06 Jönköping county 4
## 6 boys 07 Kronoberg county 4
## 7 boys 08 Kalmar county 4
## 8 boys 09 Gotland county 4
## 9 boys 10 Blekinge county 4
## 10 boys 12 Skåne county 4
## # ... with 32 more rows
Assuming that X2002-X2020 in birth1 show amounts of persons born respective year, compute total amount of people born these years irrespective gender, given Status and region. Save the result into birth3
birth3=birth1%>%
select(-sex,- foreign.Swedish.background)%>%
group_by(region, Status)%>%
summarise_all(sum)%>%
ungroup()
birth3
## # A tibble: 42 x 21
## region Status X2002 X2003 X2004 X2005 X2006 X2007 X2008 X2009 X2010
## <chr> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 01 Sto~ No 145978 147738 149411 151881 155983 160979 165751 171277 176658
## 2 01 Sto~ Yes 257239 259896 262351 264250 266025 266956 266740 267620 268601
## 3 03 Upp~ No 16505 16572 16654 16610 17236 17574 18053 18471 18792
## 4 03 Upp~ Yes 50785 50646 50684 50459 52617 52106 51607 51003 50522
## 5 04 Söd~ No 14107 14320 14311 14340 14443 14760 15414 15985 16248
## 6 04 Söd~ Yes 43185 43058 42744 42354 41920 41404 40628 39906 39377
## 7 05 Öst~ No 18068 18427 18662 18861 19453 20222 21103 21956 22499
## 8 05 Öst~ Yes 71677 71159 70484 69709 68717 67725 66286 65177 64130
## 9 06 Jön~ No 16451 16562 16611 16839 17271 17861 18308 18524 18687
## 10 06 Jön~ Yes 58973 58513 58020 57318 56399 55572 54255 53061 52074
## # ... with 32 more rows, and 10 more variables: X2011 <int>, X2012 <int>,
## # X2013 <int>, X2014 <int>, X2015 <int>, X2016 <int>, X2017 <int>,
## # X2018 <int>, X2019 <int>, X2020 <int>
By using birth3, compute percentage of people in 2002 having Status=Yes in different counties. Report a table with column region and Percentage sorted by Percentage.
birth4=birth3%>%
group_by(region)%>%
mutate(Percentage=X2002/sum(X2002)*100)%>%
filter(Status=="Yes")%>%
select(region, Percentage)%>%
ungroup()%>%
arrange(Percentage)
birth4
## # A tibble: 21 x 2
## region Percentage
## <chr> <dbl>
## 1 01 Stockholm county 63.8
## 2 12 Skåne county 70.9
## 3 19 Västmanland county 74.7
## 4 14 Västra Götaland county 74.9
## 5 04 Södermanland county 75.4
## 6 03 Uppsala county 75.5
## 7 18 Örebro county 77.8
## 8 06 Jönköping county 78.2
## 9 07 Kronoberg county 79.9
## 10 05 Östergötland county 79.9
## # ... with 11 more rows
By using birth1, transform the table to a long format: make sure that years are shown in column Year and values from the respective X2002-X2020 are stored in column Born. Make sure also that Year values show years as numerical values and store the table as birth5.
birth5 = birth1%>%
group_by(region, sex, foreign.Swedish.background, Status)%>%
pivot_longer(X2002:X2020, names_to="Year", values_to = "Born")%>%
mutate(Year=as.numeric(stringr::str_remove(Year, "X")))
birth5
## # A tibble: 3,192 x 6
## # Groups: region, sex, foreign.Swedish.background, Status [168]
## region sex foreign.Swedish.background Status Year Born
## <chr> <chr> <chr> <chr> <dbl> <int>
## 1 01 Stockholm county boys foreign born No 2002 13346
## 2 01 Stockholm county boys foreign born No 2003 13157
## 3 01 Stockholm county boys foreign born No 2004 12759
## 4 01 Stockholm county boys foreign born No 2005 12675
## 5 01 Stockholm county boys foreign born No 2006 13271
## 6 01 Stockholm county boys foreign born No 2007 14325
## 7 01 Stockholm county boys foreign born No 2008 15126
## 8 01 Stockholm county boys foreign born No 2009 16074
## 9 01 Stockholm county boys foreign born No 2010 16901
## 10 01 Stockholm county boys foreign born No 2011 18075
## # ... with 3,182 more rows
By using birth5, transform the table to wide format: make sure that years are shown as separate columns and their corresponding values are given by Born. Columns should be named as “Y_2002” for example.
birth6 = birth5%>%
group_by(region, sex, foreign.Swedish.background, Status)%>%
pivot_wider(names_from = Year, values_from = Born, names_prefix = "Y_")
birth6
## # A tibble: 168 x 23
## # Groups: region, sex, foreign.Swedish.background, Status [168]
## region sex foreign.Swedish~ Status Y_2002 Y_2003 Y_2004 Y_2005 Y_2006
## <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int>
## 1 01 Stockhol~ boys foreign born No 13346 13157 12759 12675 13271
## 2 01 Stockhol~ boys born in Sweden ~ No 31596 32185 32785 33456 34282
## 3 01 Stockhol~ boys born in Sweden ~ No 29815 30395 30976 31565 32102
## 4 01 Stockhol~ boys born in Sweden ~ Yes 131708 133236 134604 135502 136532
## 5 01 Stockhol~ girls foreign born No 12946 12732 12570 12570 13376
## 6 01 Stockhol~ girls born in Sweden ~ No 30157 30681 31328 31934 32687
## 7 01 Stockhol~ girls born in Sweden ~ No 28118 28588 28993 29681 30265
## 8 01 Stockhol~ girls born in Sweden ~ Yes 125531 126660 127747 128748 129493
## 9 03 Uppsala ~ boys foreign born No 1736 1694 1614 1565 1659
## 10 03 Uppsala ~ boys born in Sweden ~ No 2772 2834 2882 2883 2981
## # ... with 158 more rows, and 14 more variables: Y_2007 <int>, Y_2008 <int>,
## # Y_2009 <int>, Y_2010 <int>, Y_2011 <int>, Y_2012 <int>, Y_2013 <int>,
## # Y_2014 <int>, Y_2015 <int>, Y_2016 <int>, Y_2017 <int>, Y_2018 <int>,
## # Y_2019 <int>, Y_2020 <int>
By using blog data, filter out columns that have zeroes everywhere.
blogS=tibble(blog)%>%select_if(function(x) !all(x==0))
blogS
## # A tibble: 206 x 154
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2.27 4.98 0 44 1 1.03 3.00 0 37 0 0.859 2.98
## 2 3.72 7.94 0 43 0 1.37 4.51 0 41 0 1.31 4.48
## 3 3.72 7.94 0 43 0 1.37 4.51 0 41 0 1.31 4.48
## 4 123. 110. 0 1069 89 44.9 74.5 0 1046 12 42.8 74.7
## 5 43.4 75.6 0 634 20 16.0 44.6 0 473 2 15.5 44.7
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## 7 3.72 7.94 0 43 0 1.37 4.51 0 41 0 1.31 4.48
## 8 0 0 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0 0 0
## 10 43.4 75.6 0 634 20 16.0 44.6 0 473 2 15.5 44.7
## # ... with 196 more rows, and 142 more variables: V14 <dbl>, V15 <dbl>,
## # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
## # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
## # V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V34 <dbl>,
## # V35 <dbl>, V36 <dbl>, V37 <dbl>, V39 <dbl>, V41 <dbl>, V42 <dbl>,
## # V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## # V49 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>, V55 <dbl>, ...