R Learning

knitr::opts_chunk$set(
  echo = TRUE,
  eval = FALSE
)

Control Flow

if(condition){
  expr
}
x <- -3

if(x < 0){
  print("x is a negative number")
}

Redo with positive:

y <- 3
if(y < 0){
  print("y is a negative number")
}
# yields no output (because result is FALSE)

hence we want to add an else statement:

if(condition){
  expr1
} else{ # runs whenever condition is FALSE
  expr2
}
if(x < 0){
  print("x is a negative number")
} else {
  print("x is a positive number or zero")
}

additional else if statement

# NOTE: R spells it "else if" (two words); "elseif" is a syntax error
if(condition1){
  expr1
} else if(condition2){
  expr2
} else {
  expr3
}
# Classify x as negative, zero, or positive.
# "else if" must be two words in R; "elseif" does not parse.
if(x < 0){
  print("x is a negative number")
} else if(x == 0) {
  print("x is zero")
} else{
  print("x is a positive number")
}

As soon as R sees a condition as TRUE, it executes the {} and exits the rest of the control structure

x <- 6

# Only the FIRST true branch runs: 6 is divisible by both 2 and 3,
# but the chain exits after printing "divisible by 2".
# "else if" must be two words in R; "elseif" is a syntax error.
if(x %% 2 == 0){
  print("divisible by 2")
} else if(x %% 3 == 0) {
  print("divisible by 3")
} else{
  print("not divisible by 2 or 3")
}

While Loops

While loop executes code inside if condition is TRUE, but will continue to execute the code as long as condition is TRUE.

# While-loop demo: announce each counter value from 1 through 7.
ctr <- 1
while (ctr <= 7) {
  msg <- paste("ctr is set to", ctr)
  print(msg)
  ctr <- ctr + 1  # advance the counter; without this the condition never turns FALSE
}
ctr  # 8: the first value for which the condition failed

always make sure the loop ends at some point! Otherwise will have to force terminate

break statement breaks out of the while loop

stop the loop as soon as we reach a value divisible by 5

# Count upward, but stop as soon as ctr is divisible by 5.
ctr <- 1 
while(ctr <= 7){
  if(ctr %% 5 == 0){
    break
  }
  print(paste("ctr is set to", ctr))
  ctr <- ctr + 1 # work our way up by 1 - without this line, the loop would never end
}
ctr # 5 at end of loop: break fired the first time ctr hit a multiple of 5

for loop

for(var in seq){
  expr
}
cities <- c("New York", "Paris",
            "London", "Tokyo",
            "Rio de Janiero", "Cape Town")

for(city in cities){
  print(city)
}

also works with lists

cities <- list("New York", "Paris",
            "London", "Tokyo",
            "Rio de Janiero", "Cape Town")

for(city in cities){
  print(city)
}

also matrices and dataframes!

control statements for for loops:

breaks

# terminate the loop upon detection of a city name that is 6 characters long
for(city in cities){
  if(nchar(city) == 6){
    break
  }
  print(city)
}

next

# move to next element when a city name that is 6 characters long is detected (London)
for(city in cities){
  if(nchar(city) == 6){
    next # skips remainder of code  
  }
  print(city)
}

# prints all but London

We can manually create a looping index vector ourselves, R automatically starts vector at [1] (first element - New York):

But now we lose our city variable, so we have to subset the vector explicitly with []

# seq_along() is safer than 1:length(): for a zero-length vector it
# yields an empty index vector instead of the bogus c(1, 0)
for(i in seq_along(cities)){
  print(cities[i])
}
# matrices!
#ttt
#      [,1] [,2] [,3]
# [1,] "O"  NA   "X" 
# [2,] NA   "O"  "O" 
# [3,] "X"  NA   "X" 

# The tic-tac-toe matrix ttt has already been defined for you

# define the double for loop
# seq_len() handles a 0-row/0-column matrix correctly, where 1:nrow()
# would produce the bogus sequence c(1, 0)
for (i in seq_len(nrow(ttt))) {
  for (j in seq_len(ncol(ttt))) {
    print(paste("On row", i, "and column", j, "the board contains", ttt[i,j]))
  }
}

Functions

args() function is useful to learn about the arguments required for a function.

args(sd)
triple <- function(x){
  3 * x
}
ls() # its now listed in our environment
triple(6)

Numeric 6 matched to (first and only) argument x by position. Function body then executes, the last expression evaluated in an R function becomes the return value.

We can also explicitly specify the return value, by defining an object to be returned:

# Same triple(), but with the result named and returned explicitly.
triple <- function(x){
  tripled <- 3 * x
  return(tripled)
}

Not always necessary to have a return statement. But useful inside a function, often as the result of an if conditional which might break a function.

Function scoping

An issue that Filip did not discuss in the video is function scoping. It implies that variables that are defined inside a function are not accessible outside that function. Try running the following code and see if you understand the results:

# Square the argument; the local variable exists only inside the body.
pow_two <- function(x) {
  squared <- x ^ 2  # not visible outside the function (scoping)
  return(squared)
}
pow_two(4)
y
x

y was defined inside the pow_two() function and therefore it is not accessible outside of that function. This is also true for the function’s arguments of course - x in this case.

Which statement is correct about the following chunk of code? The function two_dice() is already available in the workspace.

# Roll two fair six-sided dice and return their sum (a value in 2:12).
# The result is random: repeated calls give different values, and the
# intermediate rolls dice1/dice2 are not accessible outside the function.
two_dice <- function() {
  possibilities <- 1:6
  dice1 <- sample(possibilities, size = 1)
  dice2 <- sample(possibilities, size = 1)
  dice1 + dice2  # last expression evaluated = return value
}

lapply

lapply iterates a function over any input (vector, list, etc) but always returns a list!

nyc <- list(pop = 8405837,
            boroughs = c("Manhattan", "Bronx", "Brooklyn",
            "Queens", "Staten Island"),
            capital = FALSE
)

# get class of each element of list
for(info in nyc){
  print(class(info))
}

# instead of a for loop
lapply(nyc, class)

get number of characters of city names

cities <- c("New York", "Paris",
            "London", "Tokyo",
            "Rio de Janiero", "Cape Town")

# Loop version: preallocate the result (growing a vector with c() copies
# it on every iteration) and use seq_along() instead of 1:length()
num_chars <- integer(length(cities))
for(i in seq_along(cities)) {
  num_chars[i] <- nchar(cities[i])
}
num_chars

# now use lapply
lapply(cities, nchar)

# can turn it back to a vector with unlist()
unlist(lapply(cities, nchar))

can also use lapply with your own functions

# lapply() also works with user-defined functions: triple each oil price.
oil_prices <- list(2.37, 2.48, 2.59)

triple <- function(x) {
  x * 3
}

result <- lapply(oil_prices, triple)

str(result)      # a list of three tripled prices

unlist(result)   # flatten back to a plain numeric vector

strsplit() calls, that splits the strings in pioneers on the : sign. tolower()

# Code from previous exercise:
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")
split <- strsplit(pioneers, split = ":")
split_low <- lapply(split, tolower)

# select_first(): the first element of a vector
select_first <- function(x) {
  head(x, 1)
}

# Apply select_first() over split_low: names
names <- lapply(split_low, select_first)

# select_second(): the second element of a vector
select_second <- function(x){
  x[[2]]
}

# Apply select_second() over split_low: years
years <- lapply(split_low, select_second)

lapply with additional arguments

# Definition of split_low
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")
split <- strsplit(pioneers, split = ":")
split_low <- lapply(split, tolower)

# Generic selector: pull out the element at a given position
select_el <- function(x, index) {
  x[index]
}

# Extra arguments passed to lapply() (here index = ...) are forwarded
# on to select_el() for every element
names <- lapply(split_low, select_el, index = 1)
years <- lapply(split_low, select_el, index = 2)

sapply

Because the output of lapply is always a list.

But for results that have the same type of data, can use sapply which actually uses simplify2array function to convert to a vector (1 dimensional array).

sapply(cities, nchar)

# can choose not to name the output
sapply(cities, nchar, USE.NAMES = FALSE)
# Return the alphabetically first and last characters of a name.
first_and_last <- function(name) {
  name <- gsub(" ", "", name)  # drop spaces so they cannot win min()
  # 'chars' rather than 'letters': a local named letters would mask
  # base R's built-in letters constant (the lowercase alphabet)
  chars <- strsplit(name, split = "")[[1]]
  c(first = min(chars), last = max(chars))
}

first_and_last("New York")

# makes a matrix!

sapply(cities, first_and_last)

what if unable to simplify?

# Return the distinct characters of a name (spaces removed), in order
# of first appearance.
unique_letters <- function(name) {
  name <- gsub(" ", "", name)
  # 'chars' avoids masking base R's built-in letters constant
  chars <- strsplit(name, split = "")[[1]]
  unique(chars)
}

unique_letters("London")
lapply(cities,
       unique_letters)

# here can't simplify!
sapply(cities,
       unique_letters)

Recap of apply functions

  • lapply()
    • apply function over list or vector
    • output is list
  • sapply()
    • apply function over list or vector
    • try to simplify list to array (vector or matrix)
  • vapply()
    • apply function over list or vector
    • explicitly specify output format

vapply

vapply(x, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
vapply(cities, nchar, numeric(1)) # numeric vector of length 1, 1 for each element
vapply(cities, first_and_last, character(2)) # character vector of length 2, 2 for each element
# temp is already available in the workspace

# Definition of basics()
basics <- function(x) {
  c(min = min(x), mean = mean(x), max = max(x))
}

# Apply basics() over temp using vapply()
vapply(temp, basics, numeric(3))

Apply basics() over temp using vapply() vapply(temp, basics, numeric(3)) [,1] [,2] [,3] [,4] [,5] [,6] [,7] min -1.0 5 -3.0 -2.0 2.0 -3.0 1.0 mean 4.8 9 2.2 2.4 5.4 4.6 4.6 max 9.0 13 8.0 7.0 9.0 9.0 9.0

useful functions

  • applys
  • sort(x, desc = FALSE)
  • print()
  • identical()
  • seq()
  • rep()
rep(c(8,6,4,2), times = 2)

rep(c(8,6,4,2), each = 2)

checking data structures

  • is.*()
  • as.*()
is.list(cities)
as.list(cities)
  • append() adds element to list/vector
  • rev() reverses elements in list/vector

regex

animals <- c("cat", "moose", "impala", "ant", "kiwi")

#grepl(pattern = <regex>, x = <string>)

grepl(pattern = "a",
      x = animals)

grepl(pattern = "^a", # starts with a
      x = animals)

grepl(pattern = "a$", # ends with a
      x = animals)

# grepl outputs a vector of logicals
# grep outputs a vector of the position of TRUEs

grep(pattern = "a", animals)

# use which() to get the grep from grepl

which(grepl(pattern = "a",
      x = animals))

sub() and gsub() replace characters in strings

sub(pattern = <regex>, replacement = <str>, x = <str>)

sub(pattern = "a", replacement = "o", animals) # replaces only the FIRST 'a' in each element

# look at impala, only the FIRST a has been replaced with o!
# sub only looks for FIRST match in string! then it stops!

# replace EVERY a with o with gsub
gsub(pattern = "a", replacement = "o", animals) # replace all 'a's with 'o's

# use | for OR 
# replace all a's or i's with _

gsub(pattern = "a|i", replacement = "_", animals) # replace every 'a' or 'i' with '_'
# The emails vector has already been defined for you
emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
            "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")

# Match .edu addresses robustly: require an "@", then a literal ".edu"
# at the very end of the string (escaped dot, anchored with $).
# The looser "@.*edu" would also match e.g. "a@myedublog.com".
grepl("@.*\\.edu$", emails)

# Use grep() to match for .edu addresses more robustly, save result to hits
hits <- grep("@.*\\.edu$", emails)

# Subset emails using hits
emails[hits]
# The emails vector has already been defined for you
emails <- c("john.doe@ivyleague.edu", "education@world.gov", "global@peace.org",
            "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")

# Rewrite every .edu domain as datacamp.edu: the pattern matches "@",
# anything, then a literal ".edu" at the end of the string
sub("@.*\\.edu$", "@datacamp.edu", emails)

Regular expressions are a typical concept that you’ll learn by doing and by seeing other examples. Before you rack your brains over the regular expression in this exercise, have a look at the new things that will be used:

  • .*: A usual suspect! It can be read as “any character that is matched zero or more times”.
  • \\s: Match a space. The “s” is normally a character, escaping it (\\) makes it a metacharacter.
  • [0-9]+: Match the numbers 0 to 9, at least once (+).
  • ([0-9]+): The parentheses are used to make parts of the matching string available to define the replacement. The \\1 in the replacement argument of sub() gets set to the string that is captured by the regular expression [0-9]+.
awards <- c("Won 1 Oscar.",
  "Won 1 Oscar. Another 9 wins & 24 nominations.",
  "1 win and 2 nominations.",
  "2 wins & 3 nominations.",
  "Nominated for 2 Golden Globes. 1 more win & 2 nominations.",
  "4 wins & 1 nomination.")

# Capture the number ([0-9]+) sitting between whitespace and the word
# "nomination"; the whole matching string is replaced by that captured
# group (\1). Strings with no match are returned unchanged.
sub(".*\\s([0-9]+)\\snomination.*$", "\\1", awards)

[1] "Won 1 Oscar." "24"           "2"            "3"            "2"           
[6] "1"           

The ([0-9]+) captures the entire number that comes before the word “nomination” in the string, and the entire match gets replaced by this number because of the \\1 in the replacement, which refers back to the content captured by the parentheses.

Dates and Times

To create a Date object from a simple character string in R, you can use the as.Date() function. The character string has to obey a format that can be defined using a set of symbols (the examples correspond to 13 January, 1982):

%Y: 4-digit year (1982)
%y: 2-digit year (82)
%m: 2-digit month (01)
%d: 2-digit day of the month (13)
%A: weekday (Wednesday)
%a: abbreviated weekday (Wed)
%B: month (January)
%b: abbreviated month (Jan)

The following R commands will all create the same Date object for the 13th day in January of 1982:

as.Date("1982-01-13") as.Date("Jan-13-82", format = "%b-%d-%y") as.Date("13 January, 1982", format = "%d %B, %Y")

Notice that the first line here did not need a format argument, because by default R matches your character string to the formats “%Y-%m-%d” or “%Y/%m/%d”.

In addition to creating dates, you can also convert dates to character strings that use a different date notation. For this, you use the format() function. Try the following lines of code:

today <- Sys.Date() format(Sys.Date(), format = "%d %B, %Y") format(Sys.Date(), format = "Today is a %A!")

# Definition of character strings representing dates
# Definition of character strings representing dates
str1 <- "May 23, '96"
str2 <- "2012-03-15"
str3 <- "30/January/2006"

# Parse each string into a Date, spelling out its exact layout
date1 <- as.Date(str1, format = "%b %d, '%y")  # abbrev. month, day, 2-digit year
date2 <- as.Date(str2, format = "%Y-%m-%d")    # ISO layout (also the default)
date3 <- as.Date(str3, format = "%d/%B/%Y")    # day / full month / 4-digit year

# Convert dates back to formatted strings
format(date1, "%A")      # weekday name
format(date2, "%d")      # day of month
format(date3, "%b %Y")   # abbreviated month and year

Similar to working with dates, you can use as.POSIXct() to convert from a character string to a POSIXct object, and format() to convert from a POSIXct object to a character string. Again, you have a wide variety of symbols:

%H: hours as a decimal number (00-23)
%I: hours as a decimal number (01-12)
%M: minutes as a decimal number
%S: seconds as a decimal number
%T: shorthand notation for the typical format %H:%M:%S
%p: AM/PM indicator

For a full list of conversion symbols, consult the strptime documentation in the console:

?strptime

Again, as.POSIXct() uses a default format to match character strings. In this case, it's %Y-%m-%d %H:%M:%S. In this exercise, abstraction is made of different time zones.

Supervised Machine Learning (classification)

Classification

Machines measure distance between cases to determine similarity (a stop sign image is closer to another stop sign image than to a speed limit sign, and likewise).

Imagines properties of signs as coordinates in “feature space”, e.g. color. Imagine color as a 3-dimensional feature measuring levels of red, green, and blue. Stop signs are clustered on higher ends of red feature, etc. Then in feature space you can measure distance (for example in usual Euclidean ways - straight line distance between 2 points)

k-nearest neighbors (KNN) searches dataset for observation closest to newly observed one. Comes from class package.

library(class) 
pred <- knn(training_data, testing_data, training_labels)
# has df of signs, and next_sign

# Load the 'class' package
library(class)

# Create a vector of labels
# NOTE: the column is named sign_type (singular) everywhere else in these
# notes; signs$sign_types would return NULL and break the knn() call
sign_types <- signs$sign_type

# Classify the next sign observed (train/test without the label column)
knn(train = signs[-1], test = next_sign, cl = sign_types)

Snapshot of what the data actually is

# Examine the structure of the signs dataset
str(signs)

# Count the number of signs of each type
table(signs$sign_type)

# Check r10's average red level by sign type
aggregate(r10 ~ sign_type, data = signs, mean)
# Use kNN to identify the test road signs
sign_types <- signs$sign_type 
signs_pred <- knn(train = signs[-1], test = test_signs[-1], cl = sign_types)
# train is without labels (first column) and test also (minus first column)

# Create a confusion matrix of the predicted versus actual values
signs_actual <- test_signs$sign_type
table(signs_pred, signs_actual)

# Compute the accuracy
mean(signs_pred == signs_actual)

R’s default value of k for knn is 1. Only the single closest other point is used to classify unlabeled example. Increasing k and getting different classes in the closest points puts them up to a vote. In case of a tie, winner is decided at random.

Not always case that increasing k leads to better results.

See slide example.

Small k creates very small neighborhoods, discover very subtle patterns. Can distinguish between groups even when boundary is fuzzy.

But sometimes a fuzzy boundary isn’t actually the true pattern but result of other factor that adds randomness into the data - “noise”.

Setting k larger ignores some potential noise to discover a broader, more general pattern.

No universal rule for determining k. A common rule of thumb is k = sqrt(n_training), e.g. if the car had observed 100 previous signs, use k = 10.

# Compute the accuracy of the baseline model (default k = 1)
k_1 <- knn(train = signs[-1], test = signs_test[-1], cl = sign_types)
mean(signs_actual == k_1)

# Modify the above to set k = 7
k_7 <- knn(train = signs[-1], test = signs_test[-1], cl = sign_types, k = 7)
mean(signs_actual == k_7)
# Use the prob parameter to get the proportion of votes for the winning class
sign_pred <- knn(train = signs[-1], test = signs_test[-1], cl = sign_types, k = 7, prob = TRUE)

# Get the "prob" attribute from the predicted classes
sign_prob <- attr(sign_pred, which = "prob")

# Examine the first several predictions
head(sign_pred)

# Examine the proportion of votes for the winning class
head(sign_prob)

Preparing Data for KNN

Knn assumes numeric data (distance functions). Can use dummy variables.

To calculate distance, each feature of the input data should be measured with the same range of values (i.e. consider a dummy variable vs a variable for price or color intensity /255) - the larger valued inputs would get more weight and be wrong.

R has no built in function to rescale data to a given range, so need to create one

Using Bayesian methods

using historical data to forecast future probability

Suppose my phone checked my location 40 times. 23 were at work, 4 were at store

Then prob(at work) = 23/40 = 57.5%

But to give more accurate prediction of where I’ll be, need other factors like time of day. Need to combine information from several events into a single prob. estimate - joint probability

joint prob of A & B = P(A and B). If A & B are independent, P(A and B) = P(A) x P(B)

Conditional probability: P(A|B)= P(A and B)/P(B).

P(work|evening) = 1/25 = 4% P(work|afternoon) = 20/25 = 80%

“Naive Bayes” algorithm applies Bayesian methods to estimate conditional probability of an outcome.

Package called naivebayes

library(naivebayes)

# Build a Naive Bayes model
m <- naive_bayes(location ~ time_of_day, data = location_history)

# making predictions
future_location <- predict(m, future_conditions)
# using where9am data

# Compute P(A): proportion of observations at the office
p_A <- nrow(subset(where9am, location == "office"))/nrow(where9am)

# Compute P(B): proportion of observations on a weekday
p_B <- nrow(subset(where9am, daytype == "weekday"))/nrow(where9am)

# Compute the observed P(A and B)
# Both conditions must be combined with & inside subset()'s single
# 'subset' argument; a third positional argument is taken as 'select'
p_AB <- nrow(subset(where9am, daytype == "weekday" & location == "office"))/nrow(where9am)

# Compute P(A | B) and print its value
p_A_given_B <- p_AB / p_B
p_A_given_B
thursday9am <- data.frame(daytype = "weekday")
# Load the naivebayes package
library(naivebayes)

# Build the location prediction model
locmodel <- naive_bayes(location ~ daytype, data = where9am)

# Predict Thursday's 9am location
predict(locmodel, thursday9am)

# Predict Saturdays's 9am location
predict(locmodel, saturday9am)

The naivebayes package offers several ways to peek inside a Naive Bayes model.

Typing the name of the model object provides the a priori (overall) and conditional probabilities of each of the model’s predictors. If one were so inclined, you might use these for calculating posterior (predicted) probabilities by hand.

Alternatively, R will compute the posterior probabilities for you if the type = “prob” parameter is supplied to the predict() function.

Using these methods, examine how the model’s predicted 9am location probability varies from day-to-day. The model locmodel that you fit in the previous exercise is available for you to use, and the naivebayes package has been pre-loaded.

 # Examine the location prediction model
locmodel

# Obtain the predicted probabilities for Thursday at 9am
predict(locmodel, thursday9am , type = "prob")

# Obtain the predicted probabilities for Saturday at 9am
predict(locmodel, saturday9am , type = "prob")

Naive bayes is “naive” because when multiple events are used to predict probability, it doesn’t actually measure the prob, it uses a shortcut to approximate the cond. prob.

Rather than treating the problem as the intersection of all the relevant events, the algorithm makes the “naive” assumption that the events are independent.

If A & B are independent, P(A & B) = P(A) x P(B)

So the algo doesn’t observe all the intersections in a venn diagram, it simply multiplies probs from a much simpler series of intersections (see slides)

But the Naive Bayes model still performs admirably in many applied tasks

But one other issue: suppose one event has never been observed simultaneously with the outcome event; then their joint probability (A & B) is 0. But whenever a 0 is multiplied into a chain, the whole chain becomes 0.

Simple solution is usually to add a small number (like 1) to each event and outcome combination “the Laplace correction”

# Build a NB model of location
locmodel <- naive_bayes(location ~ daytype + hourtype, data = locations)

# returns:
# Warning message: naive_bayes(): Feature daytype - zero probabilities are present. Consider Laplace smoothing.


# Predict Brett's location on a weekday afternoon
predict(locmodel, weekday_afternoon)

# Predict Brett's location on a weekday evening
predict(locmodel, weekday_evening)

with laplace parameter

# Observe the predicted probabilities for a weekend afternoon
predict(locmodel, weekend_afternoon, type = "prob")

# Build a new model using the Laplace correction
locmodel2 <- naive_bayes(location ~ daytype + hourtype, data = locations, laplace = 1)

# Observe the new predicted probabilities for a weekend afternoon
predict(locmodel2, weekend_afternoon, type = "prob")

Naive bayes works well where multiple attributes need to be considered simultaneously and evaluated as a whole - like how a doctor might evaluate symptoms and test results to make a diagnosis

Historically also used for classifying text data (like whether or not an email is spam)

Some challenges to other classification tasks:

Beginning with a tabular dataset, it builds frequency tables for the number of times each event overlaps with the outcome of interest. The probs are then multiplied, naively, in a chain of all the events. Each of the predictors typically comprises a set of categories (numeric data like age, time of day are hard to use). So we need to prepare these types of data in advance

Binning is a simple method to create categories from numeric data - divide range of numbers into a series of sets called bins. Time of day -> afternoon, morning, evening. Temperature values -> hot, warm, cool, cold

Preparing text documents similarly is difficult and needs preparation of the data first. Common process to use bag-of-words model. Doesn’t consider word order, grammar, or semantics, it just creates an event for each word that appears in text documents. - Wide table data where rows are documents and columns are words that may appear in them - Each cell indicates whether or not the word appears in the document

Naive bayes can calculate the prob of the outcome given the evidence provided by the words in the text

e.g. a document with the words “viagra” and “prescription” is more likely to be spam than a document that contains the words “naive” and “Bayes”

Logistic Regression ML

m <- glm(y ~ x1 + x2 + x3,
         data = df,
         family = "binomial")

# predicts probabilities, rather than just log-odds!
prob <- predict(m, test_dataset,
                type = "response")

# to make predictions, convert to outcomes of interest
pred <- ifelse(prob > 0.50,
               1, # predict 1 if p>0.50
               0) # predict 0 if p<=0.50
# Examine the dataset to identify potential independent variables
str(donors)

# Explore the dependent variable
table(donors$donated)

# Build the donation model
donation_model <- glm(donated ~ bad_address + interest_religion + interest_veterans, 
                      data = donors, family = "binomial")

# Summarize the model results
summary(donation_model)

As with many of R’s machine learning methods, you can apply the predict() function to the model object to forecast future behavior. By default, predict() outputs predictions in terms of log odds unless type = “response” is specified. This converts the log odds to probabilities.

Because a logistic regression model estimates the probability of the outcome, it is up to you to determine the threshold at which the probability implies action. One must balance the extremes of being too cautious versus being too aggressive. For example, if you were to solicit only the people with a 99% or greater donation probability, you may miss out on many people with lower estimated probabilities that still choose to donate. This balance is particularly important to consider for severely imbalanced outcomes, such as in this dataset where donations are relatively rare.

# Estimate the donation probability
donors$donation_prob <- predict(donation_model, type = "response")

# Find the donation probability of the average prospect
mean(donors$donated)

# [1] 0.05040551

# Predict a donation if probability of donation is greater than average (0.0504)
donors$donation_pred <- ifelse(donors$donation_prob > 0.0504, 1, 0)

# Calculate the model's accuracy
mean(donors$donation_pred == donors$donated)

# [1] 0.794815

Model performance tradeoffs

  • When one outcome (0,1) is very rare, predicting the opposite can result in a very high accuracy. (e.g. because only 5% of people are donors, predicting non-donation results in an overall accuracy of 95%, but an accuracy of 0% on the outcome that mattered most: donations).

It may be important in cases like these to sacrifice some overall accuracy to better target the outcome of interest.

Visualize ROC curve, to distinguish between + and - predictions of outcome of interest versus all others.

Classifier is trying to distinguish between X (1) and O (0).

ROC curve depicts the relationship between the true positive rate [sensitivity] and the false positive rate [1 − specificity].

Diagonal line is baseline for very poor model. The further above it another curve is, the better it is performing.

A measurement called AUC (Area Under Curve) quantifies this performance. Baseline model (random chance) has AUC of 0.50. Perfect model has AUC of 1, curve is upper left of the square.

Some cases where AUC can be misleading. Curves of different shapes/bend-locations can have the same AUC. Should look at the shape of the ROC curve too, to see how the model is performing across the range of predictions.

# Load the pROC package
library(pROC)

# Create a ROC curve
ROC <- roc(donors$donated, donors$donation_prob)

# Plot the ROC curve
plot(ROC, col = "blue")

# Calculate the area under the curve (AUC)
auc(ROC)

glm will automatically create dummy variables for categorical data that are factors. Also by default, it will exclude any missing values.

For missing numerical data, can use imputation. One simple strategy is mean imputation, imputing the mean.

Could also add new dummy variable to indicate whether value was missing or not (to see effect on prediction).

# Convert the wealth rating to a factor
donors$wealth_levels <- factor(donors$wealth_rating, levels = c(0,1,2,3), labels = c("Unknown", "Low", "Medium", "High"))

# Use relevel() to change reference category
donors$wealth_levels <- relevel(donors$wealth_levels, ref = "Medium")

# See how our factor coding impacts the model
summary(glm(donated ~ wealth_levels, data = donors, family = "binomial"))
# Find the average age among non-missing values
summary(donors$age)

# Impute missing age values with the mean age
donors$imputed_age <- ifelse(is.na(donors$age),
                             round(mean(donors$age, na.rm = TRUE), 2),
                             donors$age)

# Create missing value indicator for age
donors$missing_age <- ifelse(is.na(donors$age),
                             1,
                             0)
# Find the average age among non-missing values
summary(donors$age)

# Impute missing age values with the mean age
donors$imputed_age <- ifelse(is.na(donors$age), round(mean(donors$age, na.rm = TRUE), 2), donors$age)

# Create missing value indicator for age
donors$missing_age <- ifelse(is.na(donors$age), 1, 0)
Call:
glm(formula = donated ~ money + recency * frequency, family = "binomial", 
    data = donors)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3696  -0.3696  -0.2895  -0.2895   2.7924  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)                       -2.9999     0.3086  -9.721   <2e-16 ***
moneyHIGH                         -0.3619     0.0430  -8.415   <2e-16 ***
recencyCURRENT                    -0.1511     0.3094  -0.488    0.625    
frequencyFREQUENT                 -0.5164     0.5162  -1.000    0.317    
recencyCURRENT:frequencyFREQUENT   1.0179     0.5171   1.968    0.049 *  
---
Signif. codes:  0***0.001**0.01*0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 37330  on 93461  degrees of freedom
Residual deviance: 36938  on 93457  degrees of freedom
AIC: 36948

Number of Fisher Scoring iterations: 6
# Build a recency, frequency, and money (RFM) model
rfm_model <- glm(donated ~ recency * frequency + money, data = donors, family = "binomial")

# Summarize the RFM model to see how the parameters were coded
summary(rfm_model)

# Compute predicted probabilities for the RFM model
# predict.glm() takes 'newdata', not 'data'; an unknown 'data' argument
# is silently swallowed by ... and the training data used anyway
rfm_prob <- predict(rfm_model, newdata = donors, type = "response")

# Plot the ROC curve for the new model
library(pROC)
ROC <- roc(donors$donated, rfm_prob)
plot(ROC, col = "red")
auc(ROC)

Unlike other ML models, regression typically requires you to pick the features ahead of time.

Automatic feature selection can be used where you can’t or won’t be able to do this.

Stepwise regression runs a regression step by step, evaluating each predictor to see which ones add value to the final model.

Backward deletion procedure begins with model containing all predictors. Then checks to see what happens if each one of the predictors is removed. If removing a predictor doesn’t substantially lower model’s ability to predict outcome, it is removed. Continues step by step until only important predictors remain.

Forward selection goes in other direction, beginning with no predictors, examines effect of adding each predictor.

It is possible that the two methods might give you two completely different models!

Built in absence of theory or common sense! Violates assumptions about regressions ability to explain the causal mechanism. Only about prediction!!

Not used outside ML for these reasons.

# Specify a null model with no predictors
null_model <- glm(donated ~ 1, data = donors, family = "binomial")

# Specify the full model using all of the potential predictors
full_model <- glm(donated ~ .,
data = donors, family = "binomial")

# Use a forward stepwise algorithm to build a parsimonious model
step_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "forward")

# Estimate the stepwise donation probability
step_prob <- predict(step_model, type = "response")

# Plot the ROC of the stepwise model
library(pROC)
ROC <- roc(donors$donated, step_prob)
plot(ROC, col = "red")
auc(ROC)
Start:  AIC=7948.5
donated ~ 1

                    Df Deviance    AIC
+ frequency          1   7874.4 7878.4
+ money              1   7924.8 7928.8
+ pet_owner          1   7936.3 7940.3
+ interest_religion  1   7937.3 7941.3
+ bad_address        1   7939.4 7943.4
<none>                   7946.5 7948.5
+ wealth_rating      3   7940.6 7948.6
+ has_children       1   7944.8 7948.8
+ imputed_age        1   7945.0 7949.0
+ interest_veterans  1   7945.0 7949.0
+ catalog_shopper    1   7945.9 7949.9
+ missing_age        1   7946.0 7950.0
+ veteran            1   7946.3 7950.3
+ recency            1   7946.4 7950.4

Step:  AIC=7878.4
donated ~ frequency

                    Df Deviance    AIC
+ money              1   7861.4 7867.4
+ pet_owner          1   7864.8 7870.8
+ interest_religion  1   7866.1 7872.1
+ bad_address        1   7866.5 7872.5
+ wealth_rating      3   7867.9 7877.9
<none>                   7874.4 7878.4
+ has_children       1   7872.9 7878.9
+ interest_veterans  1   7873.3 7879.3
+ missing_age        1   7873.9 7879.9
+ imputed_age        1   7874.0 7880.0
+ catalog_shopper    1   7874.0 7880.0
+ veteran            1   7874.2 7880.2
+ recency            1   7874.3 7880.3

Step:  AIC=7867.41
donated ~ frequency + money

                    Df Deviance    AIC
+ pet_owner          1   7852.0 7860.0
+ interest_religion  1   7853.4 7861.4
+ bad_address        1   7853.7 7861.7
+ wealth_rating      3   7854.3 7866.3
<none>                   7861.4 7867.4
+ has_children       1   7859.8 7867.8
+ interest_veterans  1   7860.3 7868.3
+ missing_age        1   7860.8 7868.8
+ imputed_age        1   7861.0 7869.0
+ catalog_shopper    1   7861.1 7869.1
+ veteran            1   7861.1 7869.1
+ recency            1   7861.3 7869.3

Step:  AIC=7860
donated ~ frequency + money + pet_owner

                    Df Deviance    AIC
+ bad_address        1   7844.6 7854.6
+ interest_religion  1   7847.7 7857.7
+ has_children       1   7848.8 7858.8
+ wealth_rating      3   7845.9 7859.9
<none>                   7852.0 7860.0
+ imputed_age        1   7850.8 7860.8
+ catalog_shopper    1   7851.6 7861.6
+ veteran            1   7851.8 7861.8
+ recency            1   7851.8 7861.8
+ missing_age        1   7851.9 7861.9
+ interest_veterans  1   7852.0 7862.0

Step:  AIC=7854.59
donated ~ frequency + money + pet_owner + bad_address

                    Df Deviance    AIC
+ interest_religion  1   7840.3 7852.3
+ has_children       1   7841.3 7853.3
<none>                   7844.6 7854.6
+ wealth_rating      3   7838.6 7854.6
+ imputed_age        1   7843.2 7855.2
+ catalog_shopper    1   7844.2 7856.2
+ veteran            1   7844.3 7856.3
+ recency            1   7844.4 7856.4
+ missing_age        1   7844.5 7856.5
+ interest_veterans  1   7844.6 7856.6

Step:  AIC=7852.32
donated ~ frequency + money + pet_owner + bad_address + interest_religion

                    Df Deviance    AIC
+ has_children       1   7836.6 7850.6
<none>                   7840.3 7852.3
+ wealth_rating      3   7834.4 7852.4
+ imputed_age        1   7839.0 7853.0
+ catalog_shopper    1   7839.6 7853.6
+ veteran            1   7840.1 7854.1
+ recency            1   7840.2 7854.2
+ missing_age        1   7840.3 7854.3
+ interest_veterans  1   7840.3 7854.3

Step:  AIC=7850.64
donated ~ frequency + money + pet_owner + bad_address + interest_religion + 
    has_children

                    Df Deviance    AIC
+ wealth_rating      3   7829.8 7849.8
<none>                   7836.6 7850.6
+ catalog_shopper    1   7836.0 7852.0
+ imputed_age        1   7836.2 7852.2
+ veteran            1   7836.4 7852.4
+ missing_age        1   7836.4 7852.4
+ recency            1   7836.5 7852.5
+ interest_veterans  1   7836.6 7852.6

Step:  AIC=7849.84
donated ~ frequency + money + pet_owner + bad_address + interest_religion + 
    has_children + wealth_rating

                    Df Deviance    AIC
<none>                   7829.8 7849.8
+ catalog_shopper    1   7829.1 7851.1
+ imputed_age        1   7829.4 7851.4
+ veteran            1   7829.6 7851.6
+ recency            1   7829.7 7851.7
+ interest_veterans  1   7829.8 7851.8
+ missing_age        1   7829.8 7851.8
                    
Area under the curve: 0.5986                    

You can use response ~ . in a formula to mean “predict the response with all variables in the data frame except the response”.

Trees

Classification trees aka decision trees break down yes/no into a series of smaller decisions. Used to find a set of if/else conditions helpful for making a prediction. Easily understood without statistics - can be useful for business strategy, esp. where transparency is important (loan application approval, etc).

Growing the tree uses “divide and conquer”: it attempts to divide the data into partitions with similar values of Y.

First, look for the initial split that creates the two most homogeneous groups, e.g. high vs low credit scores. Then it again tries to create another split, e.g. high vs low loan amounts. Each split results in an if-else decision in the tree

several packages in r

  • rpart for recursive partitioning (divide-and-conquer), one of most widely used
# Template: fit a classification tree with rpart and predict class labels.
library(rpart)
# method = "class" requests a classification (not regression) tree
m <- rpart(y ~ x1 + x2, data = df, method = "class")

# predictions
# type = "class" returns predicted labels rather than class probabilities
p <- predict(m, test_data, type = "class")

example

# Load the rpart package
library(rpart)

# Build a lending model predicting loan outcome versus loan amount and credit score
# cp = 0 disables the complexity-parameter stopping rule, so the tree grows
# without cost-complexity pre-pruning
loan_model <- rpart(outcome ~ loan_amount + credit_score, data = loans, method = "class", control = rpart.control(cp = 0))

# Make a prediction for someone with good credit
# type = "class" returns the predicted label rather than class probabilities
predict(loan_model, good_credit, type = "class")

#  1 
# repaid 
# Levels: default repaid

# Make a prediction for someone with bad credit
predict(loan_model, bad_credit, type = "class")

#    1 
# default 
# Levels: default repaid

plot

# Examine the loan_model object
# (printing an rpart object lists each node's split rule and prediction)
loan_model

# Load the rpart.plot package
library(rpart.plot)

# Plot the loan_model with default settings
rpart.plot(loan_model)

# Plot the loan_model with customized settings
# NOTE(review): assumes type = 3 draws split labels under the branches and the
# palette colors nodes by predicted class — confirm against ?rpart.plot
rpart.plot(loan_model, type = 3, box.palette = c("red", "green"), fallen.leaves = TRUE)

a more simple dividing line (see slides for images) would be a diagonal! But can’t do this with divide-and-conquer. Requires algorithm to consider two features at once. Instead, a decision tree always creates an “axis-parallel split”. This is a potential weakness of decision trees - overly complex modeling certain patterns - this is an example of overfitting. Trees can become very complex very quickly - until it can accurately class every instance correctly, or runs out of feature values to split upon. This overfits to model the noise of the data.

To avoid this, it’s important to simulate unseen future data by constructing a test dataset the algorithm can’t use when growing the tree. A simple method for constructing test sets involves holding out a small random portion of the full dataset, e.g. 75% for training, 25% for testing. If the tree performs much more poorly on the test set than the training set, it suggests the model was overfitted.

Holding Out

# Hold-out validation: 75% of loans for training, 25% for testing.
# NOTE(review): sample() is not seeded here, so the split (and the accuracy
# printed below) will differ from run to run.

# Determine the number of rows for training
nrow(loans)
0.75 * nrow(loans)

# Create a random sample of row IDs
sample_rows <- sample(nrow(loans), 0.75 * nrow(loans))

# Create the training dataset
loans_train <- loans[sample_rows, ]

# Create the test dataset (every row not sampled for training)
loans_test <- loans[-sample_rows, ]
# Grow a tree using all of the available applicant data
# (cp = 0 lets the tree grow without cost-complexity pre-pruning)
loan_model <- rpart(outcome ~ ., data = loans_train, method = "class", control = rpart.control(cp = 0))

# Make predictions on the test dataset
loans_test$pred <- predict(loan_model, loans_test, type = "class")

# Examine the confusion matrix (rows = predicted, columns = actual)
table(loans_test$pred, loans_test$outcome)

#           default repaid
#  default     816    602
#  repaid      597    813

# Compute the accuracy on the test dataset
mean(loans_test$pred == loans_test$outcome)
# [1] 0.5040665

Pruning strategies

Preventing tree from growing too large by intervening early is pre-pruning. Simplest example stops divide-and-conquer once tree reaches a predefined size (e.g. max number of levels). Another method requires a minimum number of obs at a node in order for a split to occur.

Of course a tree stopped too early may miss subtle or important patterns. So instead, could grow a tree overly large and then trim it - post-pruning. Node and branches with minor impacts on accuracy are removed after the fact.

The relationship between complexity and accuracy is depicted on a graph [slides]. Look for the point at which the curve flattens — the optimal point at which to prune the tree (where the error rate becomes statistically similar to the most complex model)

# pre-pruning with rpart
# Stop the tree from growing too large up front via rpart.control limits.

library(rpart)
prune_control <- rpart.control(maxdepth = 30, # maximum depth of tree
                               minsplit = 20) # minimum number of obs at a node to attempt a split

m <- rpart(y ~ x1 + x2,
           data = df,
           method = "class",
           control = prune_control)
# post-pruning with rpart
# Grow a full tree first, inspect the complexity plot, then cut it back.

m <- rpart(y ~ x1 + x2,
           data = df,
           method = "class")

# Plot error rate against the complexity parameter (cp) for the fitted tree;
# pick the cp where the error curve flattens out.
plotcp(m)   # was plotcp(x): "x" is undefined here — the model object is "m"

m_pruned <- prune(m, 
                  cp = 0.20) # complexity parameter, i.e. where to prune tree (identified from plot)

example, prepruning

# Pre-pruning exercise: first limit tree depth, then limit minimum split size.

# Grow a tree with maxdepth of 6
prune_control <- rpart.control(cp = 0, maxdepth = 6)
loan_model <- rpart(outcome ~ ., loans_train, method = "class", control = prune_control)

# Make a class prediction on the test set
loans_test$pred <- predict(loan_model, loans_test, type = "class")

# Compute the accuracy of the simpler tree
mean(loans_test$pred == loans_test$outcome)
# [1] 0.5951202
# Swap maxdepth for a minimum split of 500 
# (a node now needs at least 500 observations before it can be split)
loan_model <- rpart(outcome ~ ., data = loans_train, method = "class", control = rpart.control(cp = 0, minsplit = 500))

# Run this. How does the accuracy change?
loans_test$pred <- predict(loan_model, loans_test, type = "class")
mean(loans_test$pred == loans_test$outcome)
# [1] 0.5954738

postpruning

# Post-pruning exercise: grow an unpruned tree, then prune at a cp value
# chosen from the complexity plot.

# Grow an overly complex tree
loan_model <- rpart(outcome ~ ., loans_train, method = "class", control = rpart.control(cp = 0))

# Examine the complexity plot (error rate vs cp)
plotcp(loan_model)

# Prune the tree at the cp value read off the plot
loan_model_pruned <- prune(loan_model, cp = 0.0014)

# Compute the accuracy of the pruned tree
loans_test$pred <- predict(loan_model_pruned, loans_test, type = "class")
mean(loans_test$pred == loans_test$outcome)
# [1] 0.6014851

Random forests

A number of classification trees can be combined into a decision tree forest. These are among the most powerful yet efficient classifiers, and popular.

Comes from collection of smaller, simpler trees that together reflect data’s complexity. Generating diversity (which captures small subtleties) is key to building powerful forest model.

Each tree should grow differently (not 100x of the same), allocate each tree a random subset of (training) data.

Machine learning methods that apply this principle are called ensemble methods: weaker learners become stronger with teamwork.

In a random forest, each tree is asked to make a prediction, and the group’s overall prediction is determined by majority vote of the trees

randomForest package

# Template: fit a random forest classifier with the randomForest package.
library(randomForest)

# NOTE: the original snippet was missing the comma after ntree = 500 (the
# trailing comment swallowed it), which made the call a syntax error.
m <- randomForest(y ~ x1 + x2, data = df,
                  ntree = 500,    # number of trees in the forest
                  mtry = sqrt(p)) # number of predictors (p) selected at random per tree (sqrt of total # of X's by default), ok to leave as is

# making predictions (majority vote across the ntree trees)
p <- predict(m, test_data)

example

# Load the randomForest package
library(randomForest)

# Build a random forest model (default ntree and mtry settings)
loan_model <- randomForest(outcome ~. , data = loans_train)

# Compute the accuracy of the random forest
# (predict() on a randomForest classifier returns class labels by default)
loans_test$pred <- predict(loan_model, loans_test)
mean(loans_test$pred == loans_test$outcome)
# [1] 0.5915842

Supervised Learning: Regression

Regression from a ML perspective. Scientific mindset: modeling to understand data generating process. Engineering mindset: modeling to predict accurately. ML is engineering mindset.

Statistically: expected value of Y | X’s. In a casual sense, predicting a numerical outcome (Y) given inputs (Xs).

Assumes expected outcome is weighted sum of all inputs. And assumes Δ Y is linearly proportional to Δ X

# unemployment is available
summary(unemployment)

# newrates is available (new male unemployment rates to predict from)
newrates

# Predict female unemployment in the unemployment dataset
# (no newdata argument, so predict() returns fitted values for the training data)
unemployment$prediction <-  predict(unemployment_model)

# Load the ggplot2 package
library(ggplot2)

# Make a plot to compare predictions to actual (prediction on x axis)
# Points on the blue y = x reference line are perfectly predicted
ggplot(unemployment, aes(x = prediction, y = female_unemployment)) + 
  geom_point() +
  geom_abline(color = "blue")

# Predict female unemployment rate when male unemployment is 5%
pred <- predict(unemployment_model, newdata = newrates)
pred

Gain Curve

# unemployment (with predictions) is available
summary(unemployment)

# unemployment_model is available
summary(unemployment_model)

# Load the package WVPlots
library(WVPlots)

# Plot the Gain Curve
# NOTE(review): this references a "predictions" column, while an earlier
# exercise created unemployment$prediction (singular) — the exercises are
# pre-loaded separately; confirm the column name if reusing this code
GainCurvePlot(unemployment, "predictions", "female_unemployment", "Unemployment model")

Root MSE

In this exercise, you will calculate the RMSE of your unemployment model. In the previous coding exercises, you added two columns to the unemployment dataset:

the model's predictions (predictions column)
the residuals between the predictions and the outcome (residuals column)

You can calculate the RMSE from a vector of residuals, $\text{res}$, as:

$$RMSE = \sqrt{\operatorname{mean}\left(\text{res}^2\right)}$$

You want RMSE to be small. How small is “small”? One heuristic is to compare the RMSE to the standard deviation of the outcome. With a good model, the RMSE should be smaller.

# Inspect the unemployment data frame
summary(unemployment)

# Pull the residual column into a short-named vector for convenience
res <- unemployment[["residuals"]]

# Root mean squared error of the model's predictions; assign and auto-print
(rmse <- sqrt(mean(res * res)))

# Standard deviation of the outcome — the yardstick RMSE is compared against
(sd_unemployment <- sd(unemployment[["female_unemployment"]]))

R^2

# unemployment is available
summary(unemployment)

# unemployment_model is available
summary(unemployment_model)

# Calculate and print the mean female_unemployment: fe_mean
(fe_mean <- mean(unemployment$female_unemployment))

# Calculate and print the total sum of squares: tss
(tss <- sum((unemployment$female_unemployment - fe_mean)^2))

# Calculate and print residual sum of squares: rss
(rss <- sum(unemployment$residuals^2))

# Calculate and print the R-squared: rsq
# R^2 = 1 - RSS/TSS, the fraction of outcome variance explained by the model
(rsq <- 1-(rss/tss))

# Get R-squared from glance and print it
# NOTE(review): glance() comes from the broom package, which is not loaded in
# this chunk — assumes it was attached earlier in the session; verify
(rsq_glance <- glance(unemployment_model)$r.squared)

The linear correlation of two variables, $x$ and $y$, measures the strength of the linear relationship between them. When $x$ and $y$ are respectively:

- the outcomes of a regression model that minimizes squared-error (like linear regression) and
- the true outcomes of the training data,

then the square of the correlation, $\rho^2$, is the same as $R^2$.
# unemployment is available
summary(unemployment)

# unemployment_model is available
summary(unemployment_model)

# Get the correlation between the prediction and true outcome: rho and print it
(rho <- cor(unemployment$predictions, unemployment$female_unemployment))

# Square rho: rho2 and print it
# For a squared-error-minimizing model, rho^2 equals R-squared on training data
(rho2 <- rho^2)

# Get R-squared from glance and print it (should match rho2)
(rsq_glance <- glance(unemployment_model)$r.squared)

Testing & Training Regression

We can fit a linear model to the training data, and then calculate RMSE and R-squared of the model for both the training and test sets. Here the model performs similarly on both sets, slightly better on training. Since the performance is not too different on test, we know the model is not overfit.

If you don’t have enough data to split into training and test sets, use cross-validation to estimate a model’s out of sample performance. In n-fold cross-validation, you partition the data into n-subsets (in the figure we use n = 3). Let’s call them A, B, and C. First, train a model using the data from sets A and B, and use that model to make predictions on C. Then train a model on B and C to predict on A. And a model on A and C to predict on B. Now, none of these models has made predictions on its own training data. All the predictions are essentially “test set” predictions. Therefore, RMSE and R-squared calculated from these predictions should give you an unbiased estimate of how a model fit to all the training data will perform on future data.

We can implement a cross-validation plan using the function kWayCrossValidation from the package vtreat. The function needs the number of rows in the training data, and the number of folds to create. In our example, we use 3 folds.

# Template: build an n-fold cross-validation plan with vtreat.
library(vtreat)
splitPlan <- kWayCrossValidation(nRows, nSplits, NULL, NULL)
# nRows: number of rows in training data
# nSplits: number of folds (partitions), e.g. 3 for 3-way
# remaining 2 args not needed here

The function returns the indices for training and testing for each fold. Use the data with the training indices to fit a model, and then make predictions on the data with the app (application, i.e. test) indices.

If the estimated model performance looks good enough, then use all the data to fit a final model. You can’t evaluate this final model’s future performance, because you don’t have data to evaluate it with. Cross-validation only tests the modeling process, while test/train split tests the final model.

We can use cross-validation to estimate the out of sample performance of a model to fit female unemployment. The estimates are similar to the estimates made by using a test set. Now let’s practice

For the next several exercises you will use the mpg data from the package ggplot2. The data describes the characteristics of several makes and models of cars from different years. The goal is to predict city fuel efficiency from highway fuel efficiency.

In this exercise, you will split mpg into a training set mpg_train (75% of the data) and a test set mpg_test (25% of the data). One way to do this is to generate a column of uniform random numbers between 0 and 1, using the function runif().

If you have a dataset dframe of size N, and you want a random subset of approximately X * 100% of dframe (where X is between 0 and 1), then:

  • Generate a vector of uniform random numbers: gp = runif(N).
  • dframe[gp < X,] will be about the right size.
  • dframe[gp >= X,] will be the complement.
# mpg is available
summary(mpg)
dim(mpg)

# Use nrow to get the number of rows in mpg (N) and print it
(N <- nrow(mpg))

# Calculate how many rows 75% of N should be and print it
# Hint: use round() to get an integer
# (target is informational only — the split below uses gp, so the actual
# training-set size is only approximately 75% of N)
(target <- round(N * 0.75))

# Create the vector of N uniform random variables: gp
gp <- runif(N)

# Use gp to create the training set: mpg_train (75% of data) and mpg_test (25% of data)
mpg_train <- mpg[gp < 0.75, ]
mpg_test <- mpg[gp >= 0.75, ]

# Use nrow() to examine mpg_train and mpg_test
nrow(mpg_train)
nrow(mpg_test)
# mpg_train is available
summary(mpg_train)

# Create a formula to express cty as a function of hwy: fmla and print it.
# (a formula object records the relationship only; no data is attached yet)
(fmla <- cty ~ hwy)

# Now use lm() to build a model mpg_model from mpg_train that predicts cty from hwy 
mpg_model <- lm(fmla, data = mpg_train)

# Use summary() to examine the model
summary(mpg_model)

Now you will test the model mpg_model on the test data, mpg_test. Functions rmse() and r_squared() to calculate RMSE and R-squared have been provided for convenience:

rmse(predcol, ycol) r_squared(predcol, ycol)

where:

  • predcol: The predicted values
  • ycol: The actual outcome

You will also plot the predictions vs. the outcome.

Generally, model performance is better on the training data than the test data (though sometimes the test set “gets lucky”). A slight difference in performance is okay; if the performance on training is significantly better, there is a problem.

The mpg_train and mpg_test data frames, and the mpg_model model have been pre-loaded, along with the functions rmse() and r_squared().

# Evaluate mpg_model on training vs test data; similar performance on both
# suggests the model is not overfit.

# Examine the objects that have been loaded
ls.str()

# predict cty from hwy for the training set
mpg_train$pred <- predict(mpg_model, mpg_train)

# predict cty from hwy for the test set
mpg_test$pred <- predict(mpg_model, mpg_test)

# Evaluate the rmse on both training and test data and print them
# (arguments follow the documented signature rmse(predcol, ycol); the
# original passed them reversed — RMSE is symmetric, so the value is the
# same, but this matches the r_squared calls below)
(rmse_train <- rmse(mpg_train$pred, mpg_train$cty))
(rmse_test <- rmse(mpg_test$pred, mpg_test$cty))


# Evaluate the r-squared on both training and test data and print them
(rsq_train <- r_squared(mpg_train$pred, mpg_train$cty))
(rsq_test <- r_squared(mpg_test$pred, mpg_test$cty))

# Plot the predictions (on the x-axis) against the outcome (cty) on the test data
ggplot(mpg_test, aes(x = pred, y = cty)) + 
  geom_point() + 
  geom_abline()
# Load the package vtreat
library(vtreat)

# mpg is available
summary(mpg)

# Get the number of rows in mpg
nRows <- nrow(mpg)

# Implement the 3-fold cross-fold plan with vtreat
splitPlan <- kWayCrossValidation(nRows, 3)

# Examine the split plan
splitPlan

In this exercise, you will use splitPlan, the 3-fold cross validation plan from the previous exercise, to make predictions from a model that predicts mpg$cty from mpg$hwy.

If dframe is the training data, then one way to add a column of cross-validation predictions to the frame is as follows:

# Template: add a column of k-fold cross-validation predictions to a training
# frame. Each row's prediction comes from a model that never saw that row
# during fitting.

# Initialize a column of the appropriate length
dframe$pred.cv <- 0 

# k is the number of folds
# splitPlan is the cross validation plan

for(i in 1:k) {
  # Get the ith split
  split <- splitPlan[[i]]

  # Build a model on the training data 
  # from this split 
  # (lm, in this case)
  model <- lm(fmla, data = dframe[split$train,])

  # make predictions on the 
  # application data from this split
  # (split$app indexes the held-out rows for this fold)
  dframe$pred.cv[split$app] <- predict(model, newdata = dframe[split$app,])
}

Cross-validation predicts how well a model built from all the data will perform on new data. As with the test/train split, for a good modeling procedure, cross-validation performance and training performance should be close.

The data frame mpg, the cross validation plan splitPlan, and the rmse() function have been pre-loaded.

# mpg is available
summary(mpg)

# splitPlan is available
str(splitPlan)

# Run the 3-fold cross validation plan from splitPlan
k <- 3 # Number of folds
mpg$pred.cv <- 0 
for(i in 1:k) {
  split <- splitPlan[[i]]
  # Fit on this fold's training rows only
  model <- lm(cty ~ hwy, data = mpg[split$train, ])
  # Predict on the held-out (application) rows
  mpg$pred.cv[split$app] <- predict(model, newdata = mpg[split$app, ])
}

# Predict from a full model
mpg$pred <- predict(lm(cty ~ hwy, data = mpg))

# Get the rmse of the full model's predictions
# (in-sample error — optimistic, since the model saw these rows when fitting)
rmse(mpg$pred, mpg$cty)

# Get the rmse of the cross-validation predictions
# (estimates out-of-sample error; expect it to be somewhat higher)
rmse(mpg$pred.cv, mpg$cty)

Congratulations! You have successfully estimated a model’s out-of-sample error via cross-validation. Remember, cross-validation validates the modeling process, not an actual model.

# alcohol is available
summary(alcohol)

# Both the formulae are available
fmla_add
fmla_interaction

# Create the splitting plan for 3-fold cross validation
set.seed(34245)  # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(alcohol), 3, NULL, NULL)

# Sample code: Get cross-val predictions for main-effects only model
alcohol$pred_add <- 0  # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_add <- lm(fmla_add, data = alcohol[split$train, ])
  alcohol$pred_add[split$app] <- predict(model_add, newdata = alcohol[split$app, ])
}

# Get the cross-val predictions for the model with interactions
# (same loop as above, but fitting the interaction formula)
alcohol$pred_interaction <- 0 # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_interaction <- lm(fmla_interaction, data = alcohol[split$train, ])
  alcohol$pred_interaction[split$app] <- predict(model_interaction, newdata = alcohol[split$app, ])
}

# Get RMSE for each model type from its cross-validation predictions
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here to
# match the course transcript
alcohol %>% 
  gather(key = modeltype, value = pred, pred_add, pred_interaction) %>%
  mutate(residuals = Metabol - pred) %>%
  group_by(modeltype) %>%
  summarize(rmse = sqrt(mean(residuals^2)))
# fdata is available
summary(fdata)

# Examine the data: generate the summaries for the groups large and small:
fdata %>% 
    group_by(label) %>%        # group by small/large purchases
    summarize(min  = min(y),   # min of y
              mean = mean(y),  # mean of y
              max  = max(y))   # max of y

# Fill in the blanks to add error columns
fdata2 <- fdata %>% 
         group_by(label) %>%               # group by label
           mutate(residual = y - pred,     # Residual
                  relerr   = residual/y)   # Relative error

# Compare the rmse and rmse.rel of the large and small groups:
# (relative error puts large and small purchases on a comparable scale)
fdata2 %>% 
  group_by(label) %>% 
  summarize(rmse     = sqrt(mean(residual^2)),  # RMSE
            rmse.rel = sqrt(mean(relerr^2)))    # Root mean squared relative error
            
# Plot the predictions for both groups of purchases
ggplot(fdata2, aes(x = pred, y = y, color = label)) + 
  geom_point() + 
  geom_abline() + 
  facet_wrap(~ label, ncol = 1, scales = "free") + 
  ggtitle("Outcome vs prediction")