r-crash-course

About the Course

Introduction

What is R?

What is so great about R?

Pros                     Cons
-----------------------  ----------------------------
Open Source              Provides a specific tool-set
Highly reproducible      Not always intuitive
Free to use              Variance in quality
Rich package ecosystem   Performance can be slower

Who is R for? (and where do I get it?)

Additional Resources

Getting Started

RStudio’s User Interface

RStudio Default Screen

Upper Left Pane - Source

Upper Left Pane

Lower Left Pane - Command Line

Lower Left Pane

Upper Right Pane - Environment et al.

Upper Right Pane

Lower Right Pane - Everything Else

Lower Right Pane

Your First R Script

## [1] "Hello world"

Variable Assignment

# Create a variable for your favorite number
# Both `=` and `<-` assign a value to a name
fav_num = 7
# Note: fav.num is a separate variable from fav_num (the names differ)
fav.num <- 7

# ...or your favorite color
fav_color = "blue"
# Typing a variable's name on its own prints its value
fav_color
## [1] "blue"
# Save a vector (a group) of numbers; c() combines the values into one vector
evens = c(2, 4, 6, 8, 10)

More Things to Try

# Arithmetic
(2 + 2) * 4^2
## [1] 64
# Use a function
mean(evens)
## [1] 6
# Use variables
fav_num + 3
## [1] 10
fav_num + evens
## [1]  9 11 13 15 17

What did we learn?

Data Structures

Vector Basics

Working with Vectors

c(c(1,2), c(3,4))
## [1] 1 2 3 4
pets = c("dog", "fish", "cat", "frog", "hamster")
first_pet = pets[1]
first_pet
## [1] "dog"
mammals = pets[c(1,3,5)]
mammals
## [1] "dog"     "cat"     "hamster"

Converting Vectors

# Numbers to logicals: 1 becomes TRUE and 0 becomes FALSE
as.logical(c(1, 0, 0, 1))
## [1]  TRUE FALSE FALSE  TRUE
# Text to logicals: both the full words and the "T"/"F" abbreviations convert
as.logical(c("TRUE", "FALSE", "F", "T"))
## [1]  TRUE FALSE FALSE  TRUE
# Logicals to numbers: TRUE becomes 1 and FALSE becomes 0
# (F and T are shorthand for FALSE and TRUE)
as.numeric(c(TRUE, FALSE, F, T))
## [1] 1 0 0 1
# Text to numbers
as.numeric(c("1", "0", "0", "1"))
## [1] 1 0 0 1
# Logicals to text
as.character(c(TRUE, FALSE, F, T))
## [1] "TRUE"  "FALSE" "FALSE" "TRUE"
# Numbers to text
as.character(c(1, 0, 0, 1))
## [1] "1" "0" "0" "1"
# Because TRUE counts as 1, sum() gives the number of TRUE values
sum(c(T, F, T, F))
## [1] 2

Sequences

1:5
## [1] 1 2 3 4 5
seq(1, 5)
## [1] 1 2 3 4 5
seq(1, 10, by = 3)
## [1]  1  4  7 10

Manipulating Character Vectors

paste("hello", "there", "friends")
## [1] "hello there friends"
paste("hello", "there", "friends", sep = "_")
## [1] "hello_there_friends"
str_sub("grades_2020", start = 8)
## [1] "2020"
str_split("grades_2020", pattern = "_")
## [[1]]
## [1] "grades" "2020"

Making Logical Vectors

1:5 <= 3
## [1]  TRUE  TRUE  TRUE FALSE FALSE
1:5 == 4
## [1] FALSE FALSE FALSE  TRUE FALSE
c("hello", "there", "friend") == "there"
## [1] FALSE  TRUE FALSE
"hello" %in% c("hello", "there", "friend")
## [1] TRUE

Logical Operators

c(TRUE & TRUE, TRUE & FALSE)
## [1]  TRUE FALSE
c(TRUE | TRUE, TRUE | FALSE)
## [1] TRUE TRUE
c(!TRUE, !FALSE)
## [1] FALSE  TRUE
rep(TRUE, 5) & rep(TRUE, 5)
## [1] TRUE TRUE TRUE TRUE TRUE

Finding the Type of a Vector

typeof("hello")
## [1] "character"
typeof(6)
## [1] "double"
typeof(TRUE)
## [1] "logical"

A Vector Inside a Vector

Tibbles

Using a Tibble

Trying a Tibble

Trying a Tibble | Answers

iris_data = as_tibble(iris)
mean(iris_data$Sepal.Length)
## [1] 5.843333
median(iris_data$Petal.Width)
## [1] 1.3
mean((iris_data$Petal.Width * iris_data$Petal.Length))
## [1] 5.794067
table(iris_data$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50

Trying a Tibble | Takeaways

Data Wrangling

What is Data Wrangling?

But First, Some Programming Basics

Becoming an R Plumber

So our previous example of

a = function_three(function_two(function_one(x)))

…becomes…

a = x %>% function_one() %>% function_two() %>% function_three()

Simple!

round(pi, 3)
## [1] 3.142

…becomes…

pi %>% round(3)
## [1] 3.142

Advanced R Piping

# Using `.` only inside a nested expression: the piped vector is still passed
# as the first argument to tibble(), so it shows up as an extra column named `.`
iris_data %>% select(Sepal.Length) %>% as_vector() %>% tibble(Standardized.Length = (. / mean(.)) )
 %>% head(1)
## # A tibble: 1 x 2
##       . Standardized.Length
##   <dbl>               <dbl>
## 1   5.1               0.873
# A pipeline that starts with `.` is stored as a reusable function
# (it is called as std_var() on the next line)
std_var = . %>% as_vector() %>% tibble(Standardized.Var = (. / mean(.)) )
iris_data %>% select(Sepal.Length) %>% std_var() %>% head(1)
## # A tibble: 1 x 2
##       . Standardized.Var
##   <dbl>            <dbl>
## 1   5.1            0.873
# Wrapping the call in { } stops the piped vector being inserted as the first
# argument; only the explicit `.` references are used, giving a single column
iris_data %>% select(Sepal.Length) %>% as_vector() %>% {tibble(Standardized.Length = (. / mean(.)) )} %>% head(1)
## # A tibble: 1 x 1
##   Standardized.Length
##                 <dbl>
## 1               0.873

Tidy Data

Using pivot_wider()

table2 %>% head(4)
## # A tibble: 4 x 4
##   country      year type          count
##   <chr>       <int> <chr>         <int>
## 1 Afghanistan  1999 cases           745
## 2 Afghanistan  1999 population 19987071
## 3 Afghanistan  2000 cases          2666
## 4 Afghanistan  2000 population 20595360
table2 %>% pivot_wider(names_from = "type", values_from = "count") %>% head(4)
## # A tibble: 4 x 4
##   country      year cases population
##   <chr>       <int> <int>      <int>
## 1 Afghanistan  1999   745   19987071
## 2 Afghanistan  2000  2666   20595360
## 3 Brazil       1999 37737  172006362
## 4 Brazil       2000 80488  174504898

Using pivot_longer()

# The wide table we start from: one row per country and year
table1 %>% head(4)
## # A tibble: 4 x 4
##   country      year cases population
##   <chr>       <int> <int>      <int>
## 1 Afghanistan  1999   745   19987071
## 2 Afghanistan  2000  2666   20595360
## 3 Brazil       1999 37737  172006362
## 4 Brazil       2000 80488  174504898
# Every column is stacked into a single `value` column, so all columns are
# converted to character first to give them one common type.
# across(everything(), ...) is the current replacement for mutate_all().
table1 %>% mutate(across(everything(), as.character)) %>% pivot_longer(cols = everything()) %>% head(4)
## # A tibble: 4 x 2
##   name       value      
##   <chr>      <chr>      
## 1 country    Afghanistan
## 2 year       1999       
## 3 cases      745        
## 4 population 19987071

Using unite()

table1 %>% head(4)
## # A tibble: 4 x 4
##   country      year cases population
##   <chr>       <int> <int>      <int>
## 1 Afghanistan  1999   745   19987071
## 2 Afghanistan  2000  2666   20595360
## 3 Brazil       1999 37737  172006362
## 4 Brazil       2000 80488  174504898
table1 %>% unite("info", cases, population) %>% head(4)
## # A tibble: 4 x 3
##   country      year info           
##   <chr>       <int> <chr>          
## 1 Afghanistan  1999 745_19987071   
## 2 Afghanistan  2000 2666_20595360  
## 3 Brazil       1999 37737_172006362
## 4 Brazil       2000 80488_174504898

Using separate()

table3 %>% head(4)
## # A tibble: 4 x 3
##   country      year rate           
##   <chr>       <int> <chr>          
## 1 Afghanistan  1999 745/19987071   
## 2 Afghanistan  2000 2666/20595360  
## 3 Brazil       1999 37737/172006362
## 4 Brazil       2000 80488/174504898
table3 %>% separate(rate, c("cases", "population")) %>% head(4)
## # A tibble: 4 x 4
##   country      year cases population
##   <chr>       <int> <chr> <chr>     
## 1 Afghanistan  1999 745   19987071  
## 2 Afghanistan  2000 2666  20595360  
## 3 Brazil       1999 37737 172006362 
## 4 Brazil       2000 80488 174504898

Excluding Variables

# The wide table we start from
table1 %>% head(4)
## # A tibble: 4 x 4
##   country      year cases population
##   <chr>       <int> <int>      <int>
## 1 Afghanistan  1999   745   19987071
## 2 Afghanistan  2000  2666   20595360
## 3 Brazil       1999 37737  172006362
## 4 Brazil       2000 80488  174504898
# A minus sign in `cols` excludes columns: everything except country and year
# is stacked into the `type` (old column name) and `count` (value) columns
table1 %>% pivot_longer(cols = -c(country, year), names_to = "type", values_to = "count") %>% head(4)
## # A tibble: 4 x 4
##   country      year type          count
##   <chr>       <int> <chr>         <int>
## 1 Afghanistan  1999 cases           745
## 2 Afghanistan  1999 population 19987071
## 3 Afghanistan  2000 cases          2666
## 4 Afghanistan  2000 population 20595360

Filtering a Data-set

table1 %>% filter(country == "China")
## # A tibble: 2 x 4
##   country  year  cases population
##   <chr>   <int>  <int>      <int>
## 1 China    1999 212258 1272915272
## 2 China    2000 213766 1280428583
table1 %>% filter(cases > 3000)
## # A tibble: 4 x 4
##   country  year  cases population
##   <chr>   <int>  <int>      <int>
## 1 Brazil   1999  37737  172006362
## 2 Brazil   2000  80488  174504898
## 3 China    1999 212258 1272915272
## 4 China    2000 213766 1280428583

Creating or Modifying Variables

table1 %>% mutate(rate = cases / population)
## # A tibble: 6 x 5
##   country      year  cases population      rate
##   <chr>       <int>  <int>      <int>     <dbl>
## 1 Afghanistan  1999    745   19987071 0.0000373
## 2 Afghanistan  2000   2666   20595360 0.000129 
## 3 Brazil       1999  37737  172006362 0.000219 
## 4 Brazil       2000  80488  174504898 0.000461 
## 5 China        1999 212258 1272915272 0.000167 
## 6 China        2000 213766 1280428583 0.000167
table1 %>% mutate(year = year %>% as.character() %>% str_sub(3))
## # A tibble: 6 x 4
##   country     year   cases population
##   <chr>       <chr>  <int>      <int>
## 1 Afghanistan 99       745   19987071
## 2 Afghanistan 00      2666   20595360
## 3 Brazil      99     37737  172006362
## 4 Brazil      00     80488  174504898
## 5 China       99    212258 1272915272
## 6 China       00    213766 1280428583

Importing Data

Everything Else

Analyzing Student Data

  1. Download the CSV file student_data.csv and copy it to your project directory
  2. Install and import the here package
  3. Import the data using the appropriate function from the readr package (learn more about the package on the tidyverse website)
  4. Rearrange the data to only have variables for first, middle, and last names, school, year, and math, English, science, and social studies grades
  5. Calculate the GPA for each student for each year (Hint: it may be easier to NOT use the mean() function)
  6. Find the average GPA for each school in 2018 (This can be done all in one step with the aggregate() function for bonus points)

Analyzing Student Data | Answers

library(here)
# Read the CSV; here() builds the path to student_data.csv
student_data = read_csv(here("student_data.csv"))
# Drop age and grade_level, stack every column other than the names and school
# into key/grade pairs, then split each key on "__" into a class and a year
gathered_student_data = student_data %>% select(-age, -grade_level) %>% 
  pivot_longer(c(-first, -middle, -last, -school), names_to = "key", values_to = "grade") %>%
  separate(key, c("class", "year"), sep = "__") 
gathered_student_data %>% head(4)
## # A tibble: 4 x 7
##   first     middle last   school  class                year  grade
##   <chr>     <chr>  <chr>  <chr>   <chr>                <chr> <dbl>
## 1 Krimhilde Yuri   Hierro Oakwood math_grade           2015     NA
## 2 Krimhilde Yuri   Hierro Oakwood english_grade        2015     NA
## 3 Krimhilde Yuri   Hierro Oakwood science_grade        2015     NA
## 4 Krimhilde Yuri   Hierro Oakwood social_studies_grade 2015     NA
# Spread the classes back out to one column each (one row per student and year)
# and average the four grades by hand to get the GPA
final_student_data = gathered_student_data %>% pivot_wider(names_from = "class", values_from = "grade") %>% 
  mutate(gpa = (math_grade + english_grade + science_grade + social_studies_grade) / 4)
final_student_data %>% head(4)
## # A tibble: 4 x 10
##   first middle last  school year  math_grade english_grade science_grade
##   <chr> <chr>  <chr> <chr>  <chr>      <dbl>         <dbl>         <dbl>
## 1 Krim~ Yuri   Hier~ Oakwo~ 2015          NA            NA            NA
## 2 Krim~ Yuri   Hier~ Oakwo~ 2016          72            64            64
## 3 Krim~ Yuri   Hier~ Oakwo~ 2017          72            58            63
## 4 Krim~ Yuri   Hier~ Oakwo~ 2018          65            57            62
## # ... with 2 more variables: social_studies_grade <dbl>, gpa <dbl>
# the easy way, repeat for each school
# (year is a character column here; R converts 2018 to "2018" for the comparison)
final_student_data %>% filter(year == 2018, school == "Oakwood") %>% select(gpa) %>% unlist() %>% mean()
## [1] 60.55165
# the harder way
# The { } lets `.` be used twice inside aggregate() without the piped table
# also being inserted as its first argument
final_student_data %>% filter(year == 2018) %>% {aggregate(.$gpa, by = list(school = .$school), mean)}
##         school        x
## 1      Oakwood 60.55165
## 2   Pine Field 62.76240
## 3 Shady Willow 66.17926

Communicating Results

Data Visualization

ggplot2 Basics

Making a Simple Visualization

  1. Select a data source - this can be piped in, which is useful if the data needs to be wrangled into the correct form for the visualization
  2. Specify default aesthetic mappings - x, y, color, and fill are the most common, but some geometries will use more or only a portion of these
  3. Select a geometry function (or several) - each of these is prefixed with geom_ (e.g. geom_point())
  4. Save to a variable to add more layers or view later
displ_year_plot = ggplot(mpg, aes(x = displ, y = hwy)) + geom_point()
displ_year_plot

Making a Slightly Less Simple Visualization

displ_year_plot = ggplot(mpg, aes(x = displ, y = hwy)) + geom_point(aes(color = class))
displ_year_plot

displ_year_plot + geom_smooth(color = "grey60")

Making a Complicated Visualization

mpg %>% arrange(-cyl) %>% ggplot(aes(x = displ, y = hwy)) + 
  geom_point(aes(color = class, size = cyl)) + 
  geom_smooth(color = "grey60") + 
  scale_radius(range = c(2, 7)) + 
  facet_wrap(~ year) + theme(legend.position = "bottom") + 
  labs(title = "Cars with Larger Engines get Worse Fuel Efficiency", subtitle = "At highway speeds in both 1999 and 2008", 
       x = "Engine Displacement  (L)", y = "Highway Fuel Efficiency (mpg)", color = "Vehicle \nClass", size = "Number of \nCylinders")

Introduction to R Markdown

Making an R Markdown document

Basic Markdown

Visualizing Student Data

Create an R Markdown document knitted into HTML with visualizations to answer the following questions:

  1. Is one school overrepresented in the data-set?
  2. Is there a relationship between math and English grades?
  3. Which school has the highest GPAs?
  4. Do grades by subject vary by school?

Visualizing Student Data | Answers

final_student_data %>% ggplot(aes(x = school)) + geom_bar()

final_student_data %>% ggplot(aes(x = math_grade, y = english_grade, color = school)) + geom_point() + geom_smooth(color = "grey60")

final_student_data %>% ggplot(aes(x = school, y = gpa, fill = school)) + geom_violin()

gathered_student_data %>% ggplot(aes(x = school, y = grade, fill = school)) + geom_violin() + facet_wrap(~class)

Programming

Programming in R

if Statement

grade_level = 12

if (grade_level > 8) {
  print("high school")
}
## [1] "high school"

if else Statement

grade_level = 7

if (grade_level > 8) {
  print("high school")
} else if(grade_level > 5) {
  print("middle school")
} else {
  print("elementary school")
}
## [1] "middle school"

while Loops

grade_level = 6
while (grade_level < 12) {
  print(grade_level)
  grade_level = grade_level + 1
}
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11

for Loops

for (cur_grade_level in 6:12) {
  print(cur_grade_level)
}
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12

The apply Family

Performance Comparisons

library(tictoc)
# One million random numbers between 0 and 10, arranged as 1000 rows x 1000 columns
rand_numbers = runif(10^6, 0, 10) %>% matrix(ncol = 1000) %>% as_tibble()
## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.
# Approach 1: a for loop over the rows, filling a preallocated result vector
tic()
output = double(nrow(rand_numbers))
# seq_len() also behaves for a table with zero rows, where 1:nrow() would
# count down through c(1, 0) and index rows that do not exist
for (i in seq_len(nrow(rand_numbers))) {
  output[i] = rand_numbers[i, ] %>% unlist() %>% mean()
}
toc()
## 1.53 sec elapsed
# Approach 2: apply() runs mean() over each row (MARGIN = 1)
tic()
output = apply(rand_numbers, 1, mean)
toc()
## 0.06 sec elapsed
# Approach 3: the purpose-built rowMeans() function
tic()
output = rowMeans(rand_numbers)
toc()
## 0.02 sec elapsed

Functions

Starting a Function

# Add two values together and return the sum.
#   argument1, argument2 - the values to add
my_function = function(argument1, argument2) {
  # The last expression is the return value. Leaving it unassigned means the
  # sum is also printed when the function is called directly at the console
  # (an assignment as the last line would return the value invisibly).
  argument1 + argument2
}

More About Arguments

a = 3
# Repeat `a` `b` times (5 by default) and print the result.
# Anything supplied through ... is handed on to print().
do_something_weird = function(a, ..., b = 5) {
  repeated = rep(a, times = b)
  print(repeated, ...)
}

do_something_weird(1)
## [1] 1 1 1 1 1
do_something_weird(9, 7)
## [1] 9 9 9 9 9
do_something_weird(5.63, digits = 1)
## [1] 6 6 6 6 6

Ending a Function

# Same idea, but the repeated vector is handed back to the caller rather than
# printed; anything supplied through ... is now handed on to rep()
do_something_weird = function(a, ..., b = 5) {
  repeated = rep(a, times = b, ...)
  repeated
}

ones = do_something_weird(1)
ones
## [1] 1 1 1 1 1
do_something_weird(9, 7) %>% sum()
## [1] 63

Converting Grades

Converting Grades | Answer

# Convert numeric grades into letter grades.
#
# Arguments:
#   grade      - numeric grades; a vector, list, or data frame column
#                (the input is flattened with unlist())
#   input_type - "hundred" (0-100 scale), "four" (0-4 scale), or "default"
#                to pick the scale from the largest grade supplied
#
# Returns a character vector with one letter ("A", "B", "C", "D" or "F")
# per grade.
# Stops with "Invalid grade" for non-numeric, negative, or above-scale grades,
# and with "Invalid input_type" for an unrecognised scale.
convert_grade = function(grade, input_type = "default") {
  grade = as.vector(unlist(grade))
  # Check the type first: || short-circuits, so min() only runs on numbers
  if (!is.numeric(grade) || min(grade) < 0) stop("Invalid grade")
  
  if (input_type == "default") {
    # A top grade of exactly 4 is a valid 4 point grade, hence <= rather than <
    if (max(grade) <= 4) {
      input_type = "four"
      message("Converting to 4 point scale")
    } else {
      input_type = "hundred"
      message("Converting to 100 point scale")
    }
  }
  # Fail loudly on an unknown scale instead of silently returning NULLs
  if (!(input_type %in% c("hundred", "four"))) stop("Invalid input_type")
  
  # The helper handles a single grade; Vectorize() applies it to each element
  conv_gr = Vectorize(function(grade, input_type) {
    if (input_type == "hundred") {
      if (grade > 100) stop("Invalid grade")
      if (grade > 89) return("A")
      if (grade > 79) return("B")
      if (grade > 69) return("C")
      if (grade > 59) return("D")
      return("F")
    }
    # 4 point scale. This branch previously re-tested "hundred", so it could
    # never run and "four" input produced no letters at all.
    if (input_type == "four") {
      if (grade > 4) stop("Invalid grade")
      if (grade > 3.7) return("A")
      if (grade > 2.7) return("B")
      if (grade > 1.7) return("C")
      if (grade > 1.0) return("D")
      return("F")
    }
  })
  
  return(conv_gr(grade, input_type))
}

student_data %>% select(math_grade__2018) %>% convert_grade()
## Converting to 100 point scale
##   [1] "D" "C" "D" "B" "D" "F" "B" "C" "D" "D" "D" "C" "D" "F" "C" "D" "C" "C"
##  [19] "C" "D" "D" "B" "C" "D" "D" "C" "C" "F" "D" "D" "D" "C" "C" "D" "B" "C"
##  [37] "D" "C" "C" "B" "D" "D" "C" "D" "C" "C" "F" "C" "B" "D" "A" "C" "F" "F"
##  [55] "F" "D" "C" "F" "F" "B" "D" "C" "C" "C" "F" "D" "D" "D" "C" "D" "C" "C"
##  [73] "B" "B" "D" "C" "C" "C" "F" "D" "C" "C" "D" "F" "D" "C" "C" "D" "B" "B"
##  [91] "B" "C" "D" "C" "F" "D" "B" "C" "C" "D" "C" "D" "D" "C" "A" "B" "D" "C"
## [109] "D" "D" "B" "C" "B" "F" "D" "C" "B" "B" "D" "C" "B" "D" "C" "D" "C" "C"
## [127] "C" "F" "F" "D" "C" "C" "D" "C" "C" "B" "C" "F" "C" "C" "C" "B" "C" "D"
## [145] "D" "F" "B" "B" "B" "C" "D" "D" "D" "C" "C" "B" "C" "D" "C" "B" "B" "A"
## [163] "C" "D" "D" "C" "D" "D" "C" "B" "C" "A" "D" "D" "C" "D" "F" "C" "D" "F"
## [181] "C" "C" "D" "D" "D" "D" "C" "F" "C" "D" "C" "C" "C" "B" "B" "C" "D" "D"
## [199] "D" "F" "C" "C" "D" "C" "C" "C" "C" "D" "B" "C" "B" "C" "D" "C" "F" "C"
## [217] "D" "F" "A" "B" "D" "D" "C" "C" "F" "D" "D" "C" "C" "D" "F" "C" "C" "D"
## [235] "C" "D" "D" "B" "F" "B" "F" "D" "D" "C" "C" "C" "D" "C" "B" "C" "D" "F"
## [253] "C" "F" "C" "D" "D" "D" "C" "D" "D" "C" "D" "D" "C" "D" "C" "C" "C" "C"
## [271] "D" "D" "C" "D" "C" "A" "D" "D" "F" "D" "C" "D" "C" "F" "D" "C" "D" "C"
## [289] "B" "C" "A" "C" "D" "C" "F" "F" "D" "C" "A" "C" "B" "B" "C" "D" "F" "B"
## [307] "C" "C" "D" "D" "C" "D" "B" "B" "C" "C" "C" "D" "D" "B" "D" "C" "C" "C"
## [325] "C" "B" "B" "C" "B" "D" "B" "D" "D" "B" "F" "C" "F" "B" "F" "B" "D" "D"
## [343] "C" "D" "C" "D" "D" "A" "D" "F" "C" "A" "C" "C" "C" "D" "D" "A" "C" "D"
## [361] "C" "C" "F" "C" "F" "A" "D" "C" "C" "D" "F" "F" "C" "C" "B" "D" "C" "B"
## [379] "B" "B" "F" "B" "C" "D" "C" "C" "B" "B" "A" "B" "C" "C" "C" "D" "F" "B"
## [397] "B" "C" "C" "C" "C" "D" "D" "D" "D" "B" "B" "C" "D" "D" "C" "B" "D" "C"
## [415] "D" "C" "B" "C" "F" "D" "D" "D" "D" "D" "F" "C" "D" "C" "F" "D" "C" "B"
## [433] "C" "D" "F" "B" "D" "D" "C" "D" "D" "C" "C" "D" "B" "C" "C" "D" "C" "C"
## [451] "C" "D" "D" "D" "D" "C" "D" "C" "F" "F" "D" "F" "C" "B" "D" "D" "B" "D"
## [469] "B" "C" "C" "C" "B" "F" "A" "C" "D" "C" "F" "F" "D" "F" "D" "C" "C" "D"
## [487] "B" "C" "C" "C" "D" "B" "C" "C" "D" "C" "F" "C" "C" "C"
convert_grade(103)
## Converting to 100 point scale
## Error in (function (grade, input_type) : Invalid grade

Wrapping Up