# Install the 'readxl' package
#install.packages("readxl")
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
#Hold link to data hosted on github
git_link = "https://github.com/Funtamendals-Charles/Homework5Exercise2/raw/refs/heads/main/Data/HDR25_Statistical_Annex_HDI_Table.xlsx"
#Create a temp file that will hold the xlsx data
temp = tempfile(fileext = ".xlsx")
#Download the file from the url and store it in the temp file location. Make sure it is in "wb" mode (write and binary)
download.file(git_link, temp, mode="wb")
#Read the data from the temp location and load it into a data frame.
original_data = read_excel(temp)
## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
#Transformation 1: Slice towards where the relevant data begins
transformed_data = original_data[4:204,2:13]
#Transformation 2: Selected only needed columns
transformed_data2 = transformed_data[,c(1,2,4,6,8,10,12)]
#Transformation 3: Realign header names by moving "Country" to the same row as the other header names (currently in row 1).
#Copy the variable holding the table
transformed_data3 = transformed_data2
#Set the cell at (1,1) to "Country"
transformed_data3[1,1] = "Country"
#Transformation 4: Promote row1 as the header row
#Copy the variable holding the table
transformed_data4 = transformed_data3
#Rename the column headers of the table to the values in the first row
colnames(transformed_data4) = transformed_data3[1,]
#Transformation 5: Remove first four rows which holds unneeded metadata.
transformed_data5 = transformed_data4[-c(1,2,3,4),]
#Transformation 6: Remove metadata rows about relative human development groups
transformed_data6 = transformed_data5[-c(75,126,170,197),]
#Transformation 7: Set the correct data types for each column
#Copy the variable holding the table
transformed_data7 = transformed_data6
#Change the appropriate columns to numeric. Apply the as.numeric function to each appropriate column and then save it to those columns.
transformed_data7[,c(2,3,4,5,6,7)] = lapply(transformed_data7[,c(2,3,4,5,6,7)], as.numeric)
#Finalize changes
Clean_data = transformed_data7
#View cleaned data
str(Clean_data)
## tibble [193 × 7] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:193] "Iceland" "Norway" "Switzerland" "Denmark" ...
## $ Human Development Index (HDI) : num [1:193] 0.972 0.97 0.97 0.962 0.959 0.959 0.958 0.955 0.955 0.951 ...
## $ Life expectancy at birth : num [1:193] 82.7 83.3 84 81.9 81.4 ...
## $ Expected years of schooling : num [1:193] 18.9 18.8 16.7 18.7 17.3 ...
## $ Mean years of schooling : num [1:193] 13.9 13.1 13.9 13 14.3 ...
## $ Gross national income (GNI) per capita: num [1:193] 69117 112710 81949 76008 64053 ...
## $ GNI per capita rank minus HDI rank : num [1:193] 12 0 5 4 13 10 14 4 6 9 ...
#Compute means:
#Apply the mean function to each appropriate column.
lapply(Clean_data[,c(3,4,5,6)], mean)
## $`Life expectancy at birth`
## [1] 73.11152
##
## $`Expected years of schooling`
## [1] 13.58476
##
## $`Mean years of schooling`
## [1] 9.173346
##
## $`Gross national income (GNI) per capita`
## [1] 24620.68