SlideShare a Scribd company logo
Introduction to R for Data Science
Lecturers
dipl. ing Branko Kovač
Data Analyst at CUBE/Data Science Mentor
at Springboard
Data Science zajednica Srbije
branko.kovac@gmail.com
dr Goran S. Milovanović
Data Scientist at DiploFoundation
Data Science zajednica Srbije
goran.s.milovanovic@gmail.com
goranm@diplomacy.edu
Linear Regression in R
• Exploratory Data Analysis
• Assumptions of the Linear Model
• Correlation
• Normality Tests
• Linear Regression
• Prediction, Confidence
Intervals, Residuals
• Influential Cases and
the Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Run this script in a fresh R session rather than calling rm(list = ls()):
# rm() only removes objects from the global environment (attached packages
# and options() survive), and it silently wipes the workspace of anyone who
# sources the file.

#### read data
library(datasets)
data(iris)

### iris data set description:
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/iris.html

### Exploratory Data Analysis (EDA)
str(iris)      # structure: column types and the first few values
summary(iris)  # per-column summaries
Linear Regression in R
• Before modeling: Assumptions and Exploratory Data Analysis (EDA)
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

### EDA plots
# plot layout: 2 x 2, filled column by column
par(mfcol = c(2, 2))

# boxplot: iris$Sepal.Length
boxplot(iris$Sepal.Length,
        horizontal = TRUE,
        xlab = "Sepal Length")

# histogram: iris$Sepal.Length
# probability = TRUE puts densities (not counts) on the y axis, so the
# density curve added below is on the same scale.
hist(iris$Sepal.Length,
     main = "",
     xlab = "Sepal.Length",
     probability = TRUE)

# overlay the iris$Sepal.Length density estimate over the empirical
# distribution
lines(density(iris$Sepal.Length),
      lty = "dashed",
      lwd = 2.5,
      col = "red")
EDA
Intro to R for Data Science
Session 6: Linear Regression in R
Linear Regression in R
• EDA
Intro to R for Data Science
Session 6: Linear Regression in R
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

## Pearson correlation in R {base}
cor1 <- cor(iris$Sepal.Length, iris$Petal.Length,
            method = "pearson")
cor1

# back to a single-panel layout for the scatter plot
par(mfcol = c(1, 1))
plot(iris$Sepal.Length, iris$Petal.Length,
     main = "Sepal Length vs Petal Length",
     xlab = "Sepal Length", ylab = "Petal Length")

## Correlation matrix and treatment of missing data
dSet <- iris
# Remove the one discrete variable
dSet$Species <- NULL
# introduce NA in dSet$Sepal.Length[5]
dSet$Sepal.Length[5] <- NA

# Pairwise and Listwise Deletion:
cor1a <- cor(dSet, use = "complete.obs")          # listwise deletion
cor1a <- cor(dSet, use = "pairwise.complete.obs") # pairwise deletion

# use = "all.obs" signals an error when the data contain NA. The failure is
# the point of this demonstration, so it is caught and reported instead of
# aborting the script; cor1a keeps the pairwise result from the line above.
invisible(tryCatch(
  cor(dSet, use = "all.obs"),
  error = function(e) {
    message("cor(dSet, use = \"all.obs\") failed: ", conditionMessage(e))
  }
))
Correlation
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

library(Hmisc)

# Pearson correlation between two vectors with rcorr()
cor2 <- rcorr(x = iris$Sepal.Length,
              y = iris$Petal.Length,
              type = "pearson")
cor2$r       # correlation matrix
cor2$r[1, 2] # the single coefficient you usually need
cor2$P       # p-values
cor2$n       # number of observations
# NOTE: rcorr uses pairwise deletion!

# Correlation matrix for the whole data set
# NOTE: as.matrix -- the data frame is converted before calling rcorr()
cor2a <- rcorr(as.matrix(dSet), type = "pearson")

# keep only coefficients significant at alpha == .05:
# locate the (row, col) positions whose p-value is not below .05 and blank
# them out in the correlation matrix; which() skips NA entries of P
notSignificant <- !(cor2a$P < .05)
w <- which(notSignificant, arr.ind = TRUE)
cor2a$r[w] <- NA
cor2a$P # compare with ...
cor2a$r # ... the filtered correlations
Correlation {Hmisc}
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Linear Regression: lm()
# Predicting: Petal Length from Sepal Length
reg <- lm(formula = Petal.Length ~ Sepal.Length, data = iris)
class(reg)
summary(reg)

# coefficients in model order: intercept first, slope second
coefsReg <- coef(reg)
coefsReg
interceptReg <- coefsReg[1]
slopeReg <- coefsReg[2]

# Prediction from this model:
# draw 100 new sepal lengths uniformly from the observed range
sepalRange <- range(iris$Sepal.Length)
newSLength <- data.frame(
  Sepal.Length = runif(100, min = sepalRange[1], max = sepalRange[2])
) # watch the variable names in the new data.frame: they must match the
  # predictor name used in the model formula!
predictPLength <- predict(reg, newdata = newSLength)
predictPLength
Linear Regression with lm()
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Standardized regression coefficients {QuantPsyc}
library(QuantPsyc)
lm.beta(reg)

# Reminder: standardized regression coefficients are...
# what you would obtain upon performing linear regression over
# standardized variables

# z-score in R
zSLength <- scale(iris$Sepal.Length, center = TRUE, scale = TRUE) # z-score
zPLength <- scale(iris$Petal.Length, center = TRUE, scale = TRUE) # see ?scale

# new dSet with standardized variables
# Columns are named with `=`. Using `<-` inside data.frame() would assign
# global variables and leave the columns with auto-generated names, so lm()
# below would only work by picking up those globals.
# scale() returns a one-column matrix; as.numeric() drops it to a vector.
dSet <- data.frame(Sepal.Length = as.numeric(zSLength),
                   Petal.Length = as.numeric(zPLength))

# Linear Regression with lm() over standardized variables
reg1 <- lm(Petal.Length ~ Sepal.Length, data = dSet)
summary(reg1)

# compare
coefficients(reg1)[2] # beta from reg1
lm.beta(reg)          # standardized beta from reg via QuantPsyc::lm.beta
Standardized Regression Coefficients
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# plots with {base} and {ggplot2}
library(ggplot2)

# Predictor vs Criterion {base}
plot(iris$Sepal.Length, iris$Petal.Length,
     main = "Petal Length vs Sepal Length",
     xlab = "Sepal Length",
     ylab = "Petal Length"
)
abline(reg, col = "red") # add the fitted regression line

# Predictor vs Criterion {ggplot2}
# Two point layers (a larger black one under a smaller white one) draw
# ring-shaped markers.
# colour = "red" is set as a constant outside aes(): inside aes() the string
# would be treated as a data value and mapped to the default palette instead
# of drawing a red line.
ggplot(data = iris,
       aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(size = 2, colour = "black") +
  geom_point(size = 1, colour = "white") +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Sepal Length vs Petal Length") +
  xlab("Sepal Length") + ylab("Petal Length") +
  theme(legend.position = "none")
Plots {base} vs {ggplot2}
Intro to R for Data Science
Session 6: Linear Regression in R
Plots {base} vs {ggplot2}
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Predicted vs. residuals {ggplot2}
predReg <- predict(reg)  # get predictions from reg
resReg <- residuals(reg) # get residuals from reg
# resStReg <- rstandard(reg) # get standardized residuals from reg

plotFrame <- data.frame(predicted = predReg,
                        residual = resReg)

# colour = "blue" is a constant, so it is set outside aes(); inside aes() it
# would be mapped through the default palette and not drawn in blue.
# se = FALSE hides the confidence band around the fitted line.
ggplot(data = plotFrame,
       aes(x = predicted, y = residual)) +
  geom_point(size = 2, colour = "black") +
  geom_point(size = 1, colour = "white") +
  geom_smooth(method = "lm",
              se = FALSE,
              colour = "blue") +
  ggtitle("Predicted vs Residual Lengths") +
  xlab("Predicted Lengths") + ylab("Residual") +
  theme(legend.position = "none")
Predicted vs Residuals
Intro to R for Data Science
Session 6: Linear Regression in R
Predicted vs Residuals
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

## Detect influential cases
# one row per observation; the columns used below are cook.d and hat
infReg <- as.data.frame(influence.measures(reg)$infmat)

# Cook's Distance: Cook and Weisberg (1982):
# values greater than 1 are troublesome
wCook <- which(infReg$cook.d > 1) # empty means no case exceeds the cut-off

# Average Leverage = (k+1)/n, k - num. of predictors, n - num. observations
# Also termed: hat values, range: 0 - 1
# see: https://en.wikipedia.org/wiki/Leverage_%28statistics%29
# Various criteria (twice the average leverage, three times the average...)
# k + 1 is the number of estimated coefficients and n the number of
# observations used in the fit. Both are read from the model itself:
# iris has no `price` column, so length(iris$price) is 0 and would turn the
# threshold into Inf, flagging nothing.
avgLeverage <- length(coef(reg)) / nobs(reg)
wLev <- which(infReg$hat > 2 * avgLeverage) # inspect: cases above 2x average

## Influence plot
# standardized residuals for the y axis of the influence plot
resStReg <- rstandard(reg)
plotFrame <- data.frame(residual = resStReg,
                        leverage = infReg$hat,
                        cookD = infReg$cook.d)
Influential Cases + Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
# Introduction to R for Data Science
# SESSION 6 :: 02 June, 2016

# Influence plot: standardized residual against leverage, one open circle
# (shape = 1) per observation, with the circle size proportional to Cook's
# distance (scaled by 100 to make the circles visible).
# The title contains a line break ("\n") between its two parts.
ggplot(plotFrame,
       aes(y = residual,
           x = leverage)) +
  geom_point(size = plotFrame$cookD * 100, shape = 1) +
  ggtitle("Influence Plot\nSize of the circle corresponds to Cook's distance") +
  theme(plot.title = element_text(size = 8, face = "bold")) +
  ylab("Standardized Residual") + xlab("Leverage")
Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
Influence Plot
Intro to R for Data Science
Session 6: Linear Regression in R
Introduction to R for Data Science :: Session 6 [Linear Regression in R]

More Related Content

PDF
Introduction to R for Data Science :: Session 5 [Data Structuring: Strings in R]
PDF
Introduction to R for Data Science :: Session 7 [Multiple Linear Regression i...
PDF
Introduction to R for Data Science :: Session 8 [Intro to Text Mining in R, M...
PDF
Introduction to R for Data Science :: Session 4
PDF
Introduction to R for Data Science :: Session 2
PDF
Introduction to R for Data Science :: Session 3
PDF
Introduction to R for Data Science :: Session 1
PDF
Introduction to R
Introduction to R for Data Science :: Session 5 [Data Structuring: Strings in R]
Introduction to R for Data Science :: Session 7 [Multiple Linear Regression i...
Introduction to R for Data Science :: Session 8 [Intro to Text Mining in R, M...
Introduction to R for Data Science :: Session 4
Introduction to R for Data Science :: Session 2
Introduction to R for Data Science :: Session 3
Introduction to R for Data Science :: Session 1
Introduction to R

What's hot (20)

PDF
Data Analysis with R (combined slides)
PDF
RDataMining slides-text-mining-with-r
PPTX
R language
PDF
Introduction to source{d} Engine and source{d} Lookout
PDF
Text mining and social network analysis of twitter data part 1
PPTX
Training in Analytics, R and Social Media Analytics
PDF
Machine Learning in R
PPTX
Hybrid acquisition of temporal scopes for rdf data
PPTX
C programming
PPTX
Text analytics in Python and R with examples from Tobacco Control
PDF
Detecting paraphrases using recursive autoencoders
PDF
Recursive Autoencoders for Paraphrase Detection (Socher et al)
PDF
Stack Algorithm
PPT
SPARQL in a nutshell
PPTX
A brief introduction to lisp language
PDF
Rbootcamp Day 1
PDF
my$talk=qr{((?:ir)?reg(?:ular )?exp(?:ressions?)?)}i;
PPTX
Natural Language Processing in R (rNLP)
PDF
Federation and Navigation in SPARQL 1.1
PPTX
Reproducible Computational Research in R
Data Analysis with R (combined slides)
RDataMining slides-text-mining-with-r
R language
Introduction to source{d} Engine and source{d} Lookout
Text mining and social network analysis of twitter data part 1
Training in Analytics, R and Social Media Analytics
Machine Learning in R
Hybrid acquisition of temporal scopes for rdf data
C programming
Text analytics in Python and R with examples from Tobacco Control
Detecting paraphrases using recursive autoencoders
Recursive Autoencoders for Paraphrase Detection (Socher et al)
Stack Algorithm
SPARQL in a nutshell
A brief introduction to lisp language
Rbootcamp Day 1
my$talk=qr{((?:ir)?reg(?:ular )?exp(?:ressions?)?)}i;
Natural Language Processing in R (rNLP)
Federation and Navigation in SPARQL 1.1
Reproducible Computational Research in R
Ad

Viewers also liked (18)

DOCX
Latest seo news, tips and tricks website lists
PPSX
Electron Configuration
PDF
Uvod u R za Data Science :: Sesija 1 [Intro to R for Data Science :: Session 1]
KEY
Benefits of Short Term Contract Hire
PPTX
Building a Scalable Data Science Platform with R
PDF
Slides erm-cea-ia
PDF
IA-advanced-R
PDF
Slides ads ia
PDF
Classification
PDF
Slides lln-risques
PDF
15 03 16_data sciences pour l'actuariat_f. soulie fogelman
PPTX
Building a Graph Database in Neo4j with Spark & Spark SQL to gain new insight...
PDF
Slides barcelona Machine Learning
PDF
12 Hidden Tips of Popular Remote Work Tools
PDF
Freelance@toptal
PDF
Spatial Data Science with R
PDF
Actuarial Analytics in R
PPTX
TextMining with R
Latest seo news, tips and tricks website lists
Electron Configuration
Uvod u R za Data Science :: Sesija 1 [Intro to R for Data Science :: Session 1]
Benefits of Short Term Contract Hire
Building a Scalable Data Science Platform with R
Slides erm-cea-ia
IA-advanced-R
Slides ads ia
Classification
Slides lln-risques
15 03 16_data sciences pour l'actuariat_f. soulie fogelman
Building a Graph Database in Neo4j with Spark & Spark SQL to gain new insight...
Slides barcelona Machine Learning
12 Hidden Tips of Popular Remote Work Tools
Freelance@toptal
Spatial Data Science with R
Actuarial Analytics in R
TextMining with R
Ad

Similar to Introduction to R for Data Science :: Session 6 [Linear Regression in R] (20)

PPTX
Rattle Graphical Interface for R Language
PDF
Introduction to R for data science
PDF
Extending lifespan with Hadoop and R
PPT
Rtutorial
PPTX
R seminar dplyr package
PDF
Gsas intro rvd (1)
PPTX
lecture-Basic-programing-R-1-basic-eng.pptx
PPTX
DATA MINING USING R (1).pptx
PPT
Language Technology Enhanced Learning
PDF
3rd Athens Big Data Meetup - 2nd Talk - Neo4j: The World's Leading Graph DB
PDF
User biglm
PDF
Compiling openCypher graph queries with Spark Catalyst
PPTX
R Language Introduction
PDF
An Introduction to Data Mining with R
PDF
R programming & Machine Learning
PPTX
Data analysis with R
ODP
Introduction to R
PPT
introtorandrstudio.ppt
PPT
Hands on data science with r.pptx
PDF
R Programming - part 1.pdf
Rattle Graphical Interface for R Language
Introduction to R for data science
Extending lifespan with Hadoop and R
Rtutorial
R seminar dplyr package
Gsas intro rvd (1)
lecture-Basic-programing-R-1-basic-eng.pptx
DATA MINING USING R (1).pptx
Language Technology Enhanced Learning
3rd Athens Big Data Meetup - 2nd Talk - Neo4j: The World's Leading Graph DB
User biglm
Compiling openCypher graph queries with Spark Catalyst
R Language Introduction
An Introduction to Data Mining with R
R programming & Machine Learning
Data analysis with R
Introduction to R
introtorandrstudio.ppt
Hands on data science with r.pptx
R Programming - part 1.pdf

More from Goran S. Milovanovic (20)

PDF
Geneva Social Media Index - Report 2015 full report
PDF
Milovanović, G.S., Krstić, M. & Filipović, O. (2015). Kršenje homogenosti pre...
PDF
247113920-Cognitive-technologies-mapping-the-Internet-governance-debate
PDF
Učenje i viši kognitivni procesi 10. Simboličke funkcije, VI Deo: Rešavanje p...
PDF
Učenje i viši kognitivni procesi 9. Simboličke funkcije, V Deo: Rezonovanje u...
PDF
Učenje i viši kognitivni procesi 9. Simboličke funkcije, V Deo: Suđenje, heur...
PDF
Učenje i viši kognitivni procesi 8. Simboličke funkcije, IV Deo: Analogija i ...
PDF
Učenje i viši kognitivni procesi 9. Simboličke funkcije, III Deo: Kauzalnost,...
PDF
Učenje i viši kognitivni procesi 8. Simboličke funkcije, II Deo: Distribuiran...
PDF
Učenje i viši kognitivni procesi 8. Simboličke funkcije, II Deo: Konekcioniza...
PDF
Učenje i viši kognitivni procesi 7a. Simboličke funkcije, I Deo: Učenje kateg...
PDF
Učenje i viši kognitivni procesi 7. Simboličke funkcije, I Deo: Koncepti, kat...
PDF
Učenje i viši kognitivni procesi 7. Učenje, IV Deo: Neasocijativno učenje, ef...
PDF
Učenje i viši kognitivni procesi 6. Učenje, III Deo: Hernstejnov zakon slagan...
PDF
Učenje i viši kognitivni procesi 6. Učenje, III Deo: Instrumentalno učenje
PDF
Učenje i viši kognitivni procesi 5. Učenje, II Deo: Blokiranje, osenčavanje, ...
PDF
Učenje i viši kognitivni procesi 5. Učenje, II Deo: klasično uslovljavanje i ...
PDF
Učenje i viši kognitivni procesi 5. Učenje, I Deo
PDF
Učenje i viši kognitivni procesi 4a. Debata o racionalnosti, nastavak
PDF
Učenje i viši kognitivni procesi 4. Debata o racionalnosti
Geneva Social Media Index - Report 2015 full report
Milovanović, G.S., Krstić, M. & Filipović, O. (2015). Kršenje homogenosti pre...
247113920-Cognitive-technologies-mapping-the-Internet-governance-debate
Učenje i viši kognitivni procesi 10. Simboličke funkcije, VI Deo: Rešavanje p...
Učenje i viši kognitivni procesi 9. Simboličke funkcije, V Deo: Rezonovanje u...
Učenje i viši kognitivni procesi 9. Simboličke funkcije, V Deo: Suđenje, heur...
Učenje i viši kognitivni procesi 8. Simboličke funkcije, IV Deo: Analogija i ...
Učenje i viši kognitivni procesi 9. Simboličke funkcije, III Deo: Kauzalnost,...
Učenje i viši kognitivni procesi 8. Simboličke funkcije, II Deo: Distribuiran...
Učenje i viši kognitivni procesi 8. Simboličke funkcije, II Deo: Konekcioniza...
Učenje i viši kognitivni procesi 7a. Simboličke funkcije, I Deo: Učenje kateg...
Učenje i viši kognitivni procesi 7. Simboličke funkcije, I Deo: Koncepti, kat...
Učenje i viši kognitivni procesi 7. Učenje, IV Deo: Neasocijativno učenje, ef...
Učenje i viši kognitivni procesi 6. Učenje, III Deo: Hernstejnov zakon slagan...
Učenje i viši kognitivni procesi 6. Učenje, III Deo: Instrumentalno učenje
Učenje i viši kognitivni procesi 5. Učenje, II Deo: Blokiranje, osenčavanje, ...
Učenje i viši kognitivni procesi 5. Učenje, II Deo: klasično uslovljavanje i ...
Učenje i viši kognitivni procesi 5. Učenje, I Deo
Učenje i viši kognitivni procesi 4a. Debata o racionalnosti, nastavak
Učenje i viši kognitivni procesi 4. Debata o racionalnosti

Recently uploaded (20)

PDF
Chapter 2 Heredity, Prenatal Development, and Birth.pdf
PPTX
Pharmacology of Heart Failure /Pharmacotherapy of CHF
PDF
English Language Teaching from Post-.pdf
PDF
Pre independence Education in Inndia.pdf
PPTX
The Healthy Child – Unit II | Child Health Nursing I | B.Sc Nursing 5th Semester
PPTX
Open Quiz Monsoon Mind Game Final Set.pptx
PDF
FourierSeries-QuestionsWithAnswers(Part-A).pdf
PDF
The Lost Whites of Pakistan by Jahanzaib Mughal.pdf
PPTX
Open Quiz Monsoon Mind Game Prelims.pptx
PDF
Microbial disease of the cardiovascular and lymphatic systems
PPTX
Introduction_to_Human_Anatomy_and_Physiology_for_B.Pharm.pptx
PPTX
Renaissance Architecture: A Journey from Faith to Humanism
PDF
Electrolyte Disturbances and Fluid Management A clinical and physiological ap...
PDF
O5-L3 Freight Transport Ops (International) V1.pdf
PDF
Open folder Downloads.pdf yes yes ges yes
PPTX
Microbial diseases, their pathogenesis and prophylaxis
PPTX
IMMUNITY IMMUNITY refers to protection against infection, and the immune syst...
PDF
ANTIBIOTICS.pptx.pdf………………… xxxxxxxxxxxxx
PDF
Business Ethics Teaching Materials for college
PPTX
Introduction and Scope of Bichemistry.pptx
Chapter 2 Heredity, Prenatal Development, and Birth.pdf
Pharmacology of Heart Failure /Pharmacotherapy of CHF
English Language Teaching from Post-.pdf
Pre independence Education in Inndia.pdf
The Healthy Child – Unit II | Child Health Nursing I | B.Sc Nursing 5th Semester
Open Quiz Monsoon Mind Game Final Set.pptx
FourierSeries-QuestionsWithAnswers(Part-A).pdf
The Lost Whites of Pakistan by Jahanzaib Mughal.pdf
Open Quiz Monsoon Mind Game Prelims.pptx
Microbial disease of the cardiovascular and lymphatic systems
Introduction_to_Human_Anatomy_and_Physiology_for_B.Pharm.pptx
Renaissance Architecture: A Journey from Faith to Humanism
Electrolyte Disturbances and Fluid Management A clinical and physiological ap...
O5-L3 Freight Transport Ops (International) V1.pdf
Open folder Downloads.pdf yes yes ges yes
Microbial diseases, their pathogenesis and prophylaxis
IMMUNITY IMMUNITY refers to protection against infection, and the immune syst...
ANTIBIOTICS.pptx.pdf………………… xxxxxxxxxxxxx
Business Ethics Teaching Materials for college
Introduction and Scope of Bichemistry.pptx

Introduction to R for Data Science :: Session 6 [Linear Regression in R]

  • 1. Introduction to R for Data Science Lecturers dipl. ing Branko Kovač Data Analyst at CUBE/Data Science Mentor at Springboard Data Science zajednica Srbije [email protected] dr Goran S. Milovanović Data Scientist at DiploFoundation Data Science zajednica Srbije [email protected] [email protected]
  • 2. Linear Regression in R • Exploratory Data Analysis • Assumptions of the Linear Model • Correlation • Normality Tests • Linear Regression • Prediction, Confidence Intervals, Residuals • Influential Cases and the Influence Plot Intro to R for Data Science Session 6: Linear Regression in R
  • 3. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 # clear rm(list=ls()) #### read data library(datasets) data(iris) ### iris data set description: # https://stat.ethz.ch/R-manual/R-devel/library/iriss/html/iris.html ### ExploratoryData Analysis (EDA) str(iris) summary(iris) Linear Regression in R • Before modeling: Assumptions and Exploratory Data Analysis (EDA) Intro to R for Data Science Session 6: Linear Regression in R
  • 4. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 ### EDA plots # plot layout:2 x 2 par(mfcol = c(2,2)) # boxplotiris$Sepal.Length boxplot(iris$Sepal.Length, horizontal = TRUE, xlab="Sepal Length") # histogram:iris$Sepal.Length hist(iris$Sepal.Length, main="", xlab="Sepal.Length", prob=T) # overlay iris$Sepal.Length density functionoverthe empiricaldistribution lines(density(iris$Sepal.Length), lty="dashed", lwd=2.5, col="red") EDA Intro to R for Data Science Session 6: Linear Regression in R
  • 5. Linear Regression in R • EDA Intro to R for Data Science Session 6: Linear Regression in R
  • 6. Intro to R for Data Science Session 6: Linear Regression in R
  • 7. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 ## Pearsoncorrelationin R {base} cor1 <- cor(iris$Sepal.Length, iris$Petal.Length, method="pearson") cor1 par(mfcol = c(1,1)) plot(iris$Sepal.Length, iris$Petal.Length, main = "Sepal Length vs Petal Length", xlab = "Sepal Length", ylab = "Petal Length") ## Correlation matrix and treatmentof missing data dSet <- iris # Remove one discretevariable dSet$Species <- NULL # introduce NA in dSet$Sepal.Length[5] dSet$Sepal.Length[5] <- NA # Pairwise and Listwise Deletion: cor1a <- cor(dSet,use="complete.obs") # listwise deletion cor1a <- cor(dSet,use="pairwise.complete.obs") # pairwise deletion cor1a <- cor(dSet,use="all.obs") # all observations -error Correlation Intro to R for Data Science Session 6: Linear Regression in R
  • 8. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 library(Hmisc) cor2 <- rcorr(iris$Sepal.Length, iris$Petal.Length, type="pearson") cor2$r # correlations cor2$r[1,2] # that's whatyou need,right cor2$P # significantat cor2$n # num.observations # NOTE: rcorr uses Pairwise deletion! # Correlation matrix cor2a <- rcorr(as.matrix(dSet), type="pearson") # NOTE:as.matrix # select significant at alpha == .05 w <- which(!(cor2a$P<.05),arr.ind = T) cor2a$r[w] <- NA cor2a$P # comparew. cor2a$r Correlation {Hmisc} Intro to R for Data Science Session 6: Linear Regression in R
  • 9. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 # LinearRegression:lm() # Predicting:PetalLength from SepalLength reg <- lm(Petal.Length ~ Sepal.Length, data=iris) class(reg) summary(reg) coefsReg <- coefficients(reg) coefsReg slopeReg <- coefsReg[2] interceptReg <- coefsReg[1] # Prediction from this model newSLength <- data.frame(Sepal.Length = runif(100, min(iris$Sepal.Length), max(iris$Sepal.Length)) ) # watch the variable namesin the new data.frame! predictPLength <- predict(reg, newSLength) predictPLength Linear Regression with lm() Intro to R for Data Science Session 6: Linear Regression in R
  • 10. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 # Standardizedregressioncoefficients {QuantPsych} library(QuantPsyc) lm.beta(reg) # Reminder:standardizedregressioncoefficientsare... # What you would obtain upon performinglinearregressionoverstandardizedvariables # z-score in R zSLength <- scale(iris$Sepal.Length, center = T, scale = T) # computes z-score zPLength <- scale(iris$Petal.Length, center = T, scale = T) # again;?scale # new dSetw. standardized variables dSet <- data.frame(Sepal.Length <- zSLength, Petal.Length <- zPLength) # LinearRegression w.lm() overstandardized variables reg1 <- lm(Petal.Length ~ Sepal.Length, data=dSet) summary(reg1) # compare coefficients(reg1)[2] # beta from reg1 lm.beta(reg) # standardizedbeta w. QuantPscy lm.beta from reg Standardized Regression Coefficients Intro to R for Data Science Session 6: Linear Regression in R
  • 11. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 # plots w. {base}and {ggplot2} library(ggplot2) # Predictorvs Criterion {base} plot(iris$Sepal.Length, iris$Petal.Length, main = "Petal Length vs Sepal Length", xlab = "Sepal Length", ylab = "Petal Length" ) abline(reg,col="red") # Predictorvs Criterion {ggplot2} ggplot(data = iris, aes(x = Sepal.Length, y = Petal.Length)) + geom_point(size = 2, colour = "black") + geom_point(size = 1, colour = "white") + geom_smooth(aes(colour = "red"), method='lm') + ggtitle("Sepal Length vs Petal Length") + xlab("Sepal Length") + ylab("Petal Length") + theme(legend.position = "none") Plots {base} vs {ggplot2} Intro to R for Data Science Session 6: Linear Regression in R
  • 12. Plots {base} vs {ggplot2} Intro to R for Data Science Session 6: Linear Regression in R
  • 13. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 # Predicted vs.residuals {ggplot2} predReg <- predict(reg) # get predictions from reg resReg <- residuals(reg) # get residuals from reg # resStReg <- rstandard(reg)# get residualsfrom reg plotFrame <- data.frame(predicted = predReg, residual = resReg); ggplot(data = plotFrame, aes(x = predicted, y = residual)) + geom_point(size = 2, colour = "black") + geom_point(size = 1, colour = "white") + geom_smooth(aes(colour = "blue"), method='lm', se=F) + ggtitle("Predicted vs Residual Lengths") + xlab("Predicted Lengths") + ylab("Residual") + theme(legend.position = "none") Predicted vs Residuals Intro to R for Data Science Session 6: Linear Regression in R
  • 14. Predicted vs Residuals Intro to R for Data Science Session 6: Linear Regression in R
  • 15. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 ## Detectinfluentialcases infReg <- as.data.frame(influence.measures(reg)$infmat) # Cook's Distance:Cook and Weisberg(1982): # values greaterthan 1 are troublesome wCook <- which(infReg$cook.d>1) # we're fine here # Average Leverage = (k+1)/n,k - num. of predictors,n - num. observations # Also termed:hat values,range:0 - 1 # see: https://en.wikipedia.org/wiki/Leverage_%28statistics%29 # Various criteria (twice the leverage,three times the average...) wLev <- which(infReg$hat>2*(2/length(iris$price))) # we seem to be fine here too... ## Influenceplot infReg <- as.data.frame(influence.measures(reg)$infmat) plotFrame <- data.frame(residual = resStReg, leverage = infReg$hat, cookD = infReg$cook.d) Infulential Cases + Infulence Plot Intro to R for Data Science Session 6: Linear Regression in R
  • 16. # Introduction to R for Data Science # SESSION 6 :: 02 June, 2016 ggplot(plotFrame, aes(y = residual, x = leverage)) + geom_point(size = plotFrame$cookD*100, shape = 1) + ggtitle("Influence PlotnSize of the circle corresponds to Cook's distance") + theme(plot.title = element_text(size=8, face="bold")) + ylab("Standardized Residual") + xlab("Leverage") Infulence Plot Intro to R for Data Science Session 6: Linear Regression in R
  • 17. Influence Plot Intro to R for Data Science Session 6: Linear Regression in R