Adam Shelton
Names are webscraped from randomly generated names from behindthename.com
names = character()
for (i in 1:100) {
names_page = read_html("http://www.behindthename.com/random/random.php?number=2&sets=5&gender=both&surname=&randomsurname=yes&usage_afr=1&usage_chi=1&usage_eng=1&usage_fre=1&usage_ger=1&usage_jap=1&usage_spa=1")
names = c(names, names_page %>% html_nodes("center div") %>% html_text() %>% str_squish() %>% .[1:5])
}
drop_longer = Vectorize(function(char_vect, len) {
if (length(char_vect) > len) {
return(char_vect[1:len])
}
return(char_vect)
})
student_data = names %>% str_split(" ") %>% drop_longer(3) %>% unlist() %>% matrix(ncol = 3, byrow = TRUE) %>% as_tibble(.name_repair = ~ c("first", "middle", "last"))
num_to_letter_grade = function(x) {
if (x > 89) {
return("A")
}
if (x > 79) {
return("B")
}
if (x > 69) {
return("C")
}
if (x > 59) {
return("D")
}
return("F")
}
add_error = function(x, std_dev = 1) {
error = rnorm(length(x), 0, std_dev)
return(x + error)
}
rescale_grades = function(x) {
scale_coef = 100 / max(x)
return((x * scale_coef) - rnorm(1, 0.25))
}
get_last_school_year = function(x) {
cur_mon = as.Date(x) %>% format("%m") %>% as.numeric()
cur_year = as.Date(x) %>% format("%Y") %>% as.numeric()
if (cur_mon > 6) {
return(cur_year - 1)
}
return(cur_year - 2)
}
last_sch_yr = Sys.Date() %>% get_last_school_year()
years_back = 4
school_grade_adjustment = 7
student_data$grade_level = nrow(student_data) %>% runif(9, 12) %>% round()
student_data$age = (student_data$grade_level + 5.5 + rnorm(nrow(student_data))) %>% round()
student_data$school = nrow(student_data) %>% runif(1, 3) %>% round() %>% factor(levels = c(1,2,3), labels = c("Oakwood", "Shady Willow", "Pine Field"))
base_grades = nrow(student_data) %>% rnorm(72, 9) %>% add_error(0.5)
student_data$math_grade = base_grades %>% {ifelse(student_data$school == "Oakwood", . - rnorm(length(.), school_grade_adjustment), .)} %>% round()
student_data$english_grade = base_grades %>% add_error(8) %>% rescale_grades() %>% {ifelse(student_data$school == "Oakwood" | student_data$school == "Pine Field", . - rnorm(length(.), school_grade_adjustment), .)} %>% round()
student_data$science_grade = base_grades %>% add_error(7) %>% rescale_grades() %>% {ifelse(student_data$school == "Oakwood" | student_data$school == "Shady Willow", . -rnorm(length(.), school_grade_adjustment), .)} %>% round()
student_data$social_studies_grade = ((student_data$english_grade + student_data$science_grade) / 2) %>% add_error(6) %>% rescale_grades() %>% {ifelse(student_data$school == "Shady Willow", . + rnorm(length(.), school_grade_adjustment), .)} %>% round()
student_data %>% mutate(school = as.numeric(school)) %>% select(-first, -middle, -last) %>% cor() %>% ggcorrplot()
more_grades = function(year, data_set, min_year, year_grade_bump, grade_names = c("math_grade", "english_grade", "science_grade", "social_studies_grade")) {
grade_dif = max(data_set$grade_level) - data_set$grade_level
year_dif = year - min_year
for (n in grade_names) {
new_grades = data_set[[n]] %>% add_error(3) %>% (function(x) x + year_grade_bump ) %>% round()
missing_grades = rep(NA, length(new_grades))
data_set[[paste(n, year, sep = "__")]] = ifelse(grade_dif <= year_dif, new_grades, missing_grades) %>% {ifelse(. < 100, ., rep(100, length(.)) )}
}
return(data_set)
}
for (year in (last_sch_yr - years_back + 1):last_sch_yr) {
student_data = more_grades(year, student_data, (last_sch_yr - years_back + 1), runif(nrow(student_data), 0.15, 0.5))
}
student_data = student_data %>% select(-math_grade, -english_grade, -science_grade, -social_studies_grade)
plot_grid(student_data %>% ggplot(aes(x = school, y = math_grade__2018, color = school, fill = school)) + geom_violin() + theme(axis.text.x = element_blank()),
student_data %>% ggplot(aes(x = school, y = english_grade__2018, color = school, fill = school)) + geom_violin() + theme(axis.text.x = element_blank()),
student_data %>% ggplot(aes(x = school, y = science_grade__2018, color = school, fill = school)) + geom_violin() + theme(axis.text.x = element_blank()),
student_data %>% ggplot(aes(x = school, y = social_studies_grade__2018, color = school, fill = school)) + geom_violin() + theme(axis.text.x = element_blank()))
write_csv(student_data, here("student_data.csv"))