SlideShare a Scribd company logo
Introduction to R for Data Science
Lecturers
dipl. ing Branko Kovač
Data Analyst at CUBE/Data Science Mentor
at Springboard
Data Science zajednica Srbije
branko.kovac@gmail.com
dr Goran S. Milovanović
Data Scientist at DiploFoundation
Data Science zajednica Srbije
goran.s.milovanovic@gmail.com
goranm@diplomacy.edu
Strings in R
• {base} for strings
• {stringr} for strings
• {stringi} for strings
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# Processing strings in R
library(stringr)
# In R a string is just a character vector (here: vectors of length one)
stringA <- "Hello world"
stringB <- "Sun shines!"
stringA
stringB

# Type check, then coercion between character and numeric
is.character(stringA)  # TRUE
as.character(200 * 5)  # number -> character
as.numeric("1000")     # character -> number
as.double("3.14")      # as.double() is the same coercion as as.numeric()
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# Using " and '
# either: single quotes inside a double-quoted string
stringA <- "Hello 'World'"
stringA
# or: double quotes inside a single-quoted string
stringA <- 'Hello "World"'
stringA # prints: "Hello \"World\"" - what is this: \ ?
print(stringA)
# try: writeLines() writes the characters themselves,
# while print() shows the string in its escaped, quoted form
writeLines(stringA)
print(stringA)
# Escaping in R: use \, the R escape character
stringA <- "Hello \"World\""
stringA
print(stringA)
writeLines(stringA)
# Escaping the escape character itself
writeLines("\\") # nice: a single backslash is written
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# String concatenation in R
# c() does not join strings: it builds a character vector of length == 2
stringC <- c(stringA, stringB)
length(stringC)
# paste() {base} joins its arguments into one string: length == 1
stringC <- paste(stringA, stringB, sep = ",")
writeLines(stringC)
# sep vs. collapse (paste args):
# sep goes between the arguments, collapse joins the elements of a vector
stringC <- c(stringA, stringB)
stringC <- paste(stringC, collapse = "__")
writeLines(stringC)
# paste0() {base} is paste() with sep = "", and is faster than paste()
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# Three short strings, available both as one vector and as separate scalars
strD <- c("One", "Two", "Three")
strA <- strD[1]
strB <- strD[2]
strC <- strD[3]
# paste0() joins its arguments with no separator
paste0(strA, strB, strC)
# collapse is available in paste0() as well: join the elements of one vector
paste0(strD, collapse = "-")
# {stringr} concatenation: str_c() also has sep and collapse as args
str_c(strA, strB, strC)
str_c(strA, strB, strC, sep = "...")
str_c(strD, collapse = "...")
# both paste() {base} and str_c() {stringr} are vectorized:
# the scalar prefix is recycled against every element of strD
# NOTE(review): "Prefix-" combined with sep = "-" yields a double dash
# ("Prefix--One"); confirm this is intended
paste("Prefix-", strD, sep = "-")
str_c("Prefix-", strD, sep = "-")
Strings in R
• Concatenation
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
stringA <- "The quick brown fox jumps over the lazy dog"
# strsplit() returns a list (one element per input string): is.list(splitA) is TRUE
splitA <- strsplit(stringA, " ")
# flatten the list to a plain character vector of words
splitA <- unlist(strsplit(stringA, " "))
# "The quick brown": the first three words of stringA, joined back together
splitA <- paste(unlist(strsplit(stringA, " "))[1:3], collapse = " ")
# or
splitA <- paste(strsplit(stringA, " ")[[1]][1:3], collapse = " ")
# advice: use fixed = TRUE, which says: match the split argument exactly;
# otherwise split is a regular expression; the default is fixed = FALSE
splitA <- strsplit(stringA, " ", fixed = TRUE)
# Splitting with {stringr}: str_split() returns a list as well
is.list(str_split(stringA, pattern = " "))
# this is interesting: n caps the number of pieces,
# and the last piece keeps the unsplit remainder
str_split(stringA, pattern = " ", n = 3)
# "The quick brown": the first three words of stringA, joined back together
paste0(head(str_split(stringA, pattern = " ", n = 4)[[1]], 3), collapse = " ")
Strings in R
• Splitting
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# default: str_split(string, pattern, n = Inf), where pattern is a regex
# boundary("word") splits on word boundaries instead of a regex
str_split(stringA, boundary("word"))
# very useful:
stringA1 <- "The quick brown fox jumps over the lazy dog"
str_split(stringA1, boundary("word"))
# a multi-line string: the line breaks below are part of the string itself
stringA1 <- "Above all, don't lie to yourself.
The man who lies to himself and listens to his own lie comes to a point that he cannot distinguish the
truth within him, or around him, and so loses all respect for himself and for others.
And having no respect he ceases to love."
str_split(stringA1, boundary("word"))
# skip_word_none = FALSE: including punctuation and special characters
str_split(stringA1, boundary("word", skip_word_none = FALSE))
str_split(stringA1, boundary("line_break"))
writeLines(str_split(stringA1, boundary("line_break"))[[1]])
Strings in R
• Splitting
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
stringA <- c("Belgrade", "Zagreb", "Ljubljana")
# {stringr}: the first two characters of every element
str_sub(stringA, start = 1, end = 2)
# counting backwards: negative positions are taken from the end of the string
str_sub(stringA, start = -3, end = -1)
# {base}: the first three characters of every element
substr(stringA, start = 1, stop = 3)
# play: start and stop can be vectors, one pair per element
substr(stringA, start = 1:3, stop = 2:4)
# nope: {base} substr() does not count backwards
substr(stringA, start = -2, stop = -1)
Strings in R
• Subsetting strings
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# Replacing characters in strings
# {stringr}: the replacement form of str_sub()
stringB <- stringA # just a copy of stringA
str_sub(stringB, 1, 2) <- "00"
stringB
# {base}: the replacement form of substr() never changes the string length;
# it overwrites at most as many characters as the start:stop range covers
stringB <- stringA # just a copy of stringA
substr(stringB, 1, 3) <- "WowWow" # check the result!
stringB
substr(stringB, 1, 4) <- "WoWWow" # check the result!
stringB
substr(stringB, 1, 6) <- "WowWow" # check the result!
stringB
Strings in R
• Subsetting strings
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# UPPER CASE to lower case and vice versa in R
stringA <- "ABRACADABRA"
# {base}: tolower() / toupper() return a converted copy;
# assign the result back to keep it
tolower(stringA)
stringA <- tolower(stringA)
toupper(stringA)
stringA <- toupper(stringA)
# {stringr} counterparts of tolower() / toupper()
str_to_lower(string = stringA)
stringB <- str_to_lower(string = stringA)
str_to_upper(string = stringA)
# title case: capitalize the first letter
str_to_title(string = stringB)
• Transforming strings
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# Remove whitespace with {stringr}
stringA <- " Removewhitespace "
# both ends (the default side)
str_trim(stringA, side = "both")
# leading whitespace only
str_trim(stringA, side = "left")
# trailing whitespace only
str_trim(stringA, side = "right")
# remove all whitespace?
stringA <- " Remove whitespace " # how about this one?
# there are different ways to do it. Try:
# (with fixed = FALSE, the first (pattern) argument is treated as a regex)
gsub(" ", "", stringA, fixed = TRUE)
# in general: gsub() replaces every occurrence of the pattern
stringA <- "The quick brown fox jumps over the lazy dog The quick brown"
gsub("The quick brown", "The slow red", stringA, fixed = TRUE)
Strings in R
• More transforming
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# Searching for something in a string
# (stringA is defined here so that this section runs on its own)
stringA <- "The quick brown fox jumps over the lazy dog The quick brown"
# Does a string encompass a substring?
grepl("The quick brown", stringA, fixed = TRUE)
grepl("The fast red", stringA, fixed = TRUE)
stringB <- "Uraaaaaaaa"
# grep() returns the indices of the elements that contain the pattern
grep("The quick brown", c(stringA, stringB), fixed = TRUE)
# where? gregexpr() returns a list with one element per input string,
# holding the start positions of all matches
w <- gregexpr("The quick brown", stringA)
str(w)
b1 <- w[[1]][1] # first match starts at
b2 <- w[[1]][2] # second match starts at
# now, match.length is an attribute of w[[1]], not w itself:
e1 <- attr(w[[1]], "match.length", exact = TRUE)[1]
e2 <- attr(w[[1]], "match.length", exact = TRUE)[2]
Strings in R
• Search
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# first match extraction (start position b1, match length e1):
str_sub(stringA, b1, b1 + e1 - 1)
# second match extraction (start position b2, match length e2):
str_sub(stringA, b2, b2 + e2 - 1)
# Ok, but easier and more convenient with {stringr}
str_detect(stringA, "The quick brown") # TRUE or FALSE
str_locate(stringA, "The quick brown") # first match
str_locate_all(stringA, "The quick brown") # all matches
# term frequency, as we know, is very important in text-mining:
# all matches for term1, i.e. "The quick brown"
term1 <- str_locate_all(stringA, "The quick brown")[[1]]
# how many matches = how many rows in the str_locate_all() output matrix
nrow(term1)
Strings in R
• Search
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# Sorting strings in R
letters
str_sort(letters, locale = "en") # locale = English
str_sort(letters, locale = "haw") # locale = Hawaiian
# backwards
str_sort(letters, decreasing = TRUE)
# handy: na_last controls where missing values end up
stringA <- c("New York", "Paris", NA, "Moscow", "Tokyo")
str_sort(stringA, na_last = TRUE)
# [1] "Moscow" "New York" "Paris" "Tokyo" NA
str_sort(stringA, na_last = FALSE)
# [1] NA "Moscow" "New York" "Paris" "Tokyo"
# {base}: sort() drops NA values by default (na.last = NA)
sort(stringA)
sort(stringA, decreasing = TRUE)
Strings in R
• Sorting strings
Intro to R for Data Science
Session 5: Structuring Data: Strings in R
# Introduction to R for Data Science
# SESSION 5 :: 26 May, 2016
# Take-home message on encodings
# 1. Most of the time, you simply need to know the source encoding
# 2. All of the time *** convert everything to UTF-8 *** - as soon as possible
# 3. Most {base}, and all {stringr} and {stringi} functions that process strings in R
# will convert their output to UTF-8 automatically
# Working inside R only, running an English locale, will never cause you any trouble
# However, in Data Science you will probably need to do a lot of web-scraping for a living
# - and that's where the fun starts.
# God bless iconv() - but don't get too excited, it does not avoid all problems
# Next session: Thursday, June 2, 2016 :: Linear Regression w. R
Strings in R
• Encodings…
Introduction to R for Data Science :: Session 5 [Data Structuring: Strings in R]

More Related Content

What's hot (20)

PDF
Introduction to Data Mining with R and Data Import/Export in R
Yanchang Zhao
 
PDF
RDataMining slides-text-mining-with-r
Yanchang Zhao
 
PDF
Introduction to source{d} Engine and source{d} Lookout
source{d}
 
PPTX
R language
LearningTech
 
PDF
Text mining and social network analysis of twitter data part 1
Johan Blomme
 
PDF
my$talk=qr{((?:ir)?reg(?:ular )?exp(?:ressions?)?)}i;
dankogai
 
PPTX
2. R-basics, Vectors, Arrays, Matrices, Factors
krishna singh
 
PPTX
Wireless sensor network Apriori an N-RMP
Amrit Khandelwal
 
PPTX
Python pandas Library
Md. Sohag Miah
 
PPTX
Text analytics in Python and R with examples from Tobacco Control
Ben Healey
 
PDF
Future features for openCypher: Schema, Constraints, Subqueries, Configurable...
openCypher
 
PDF
Introduction to R Programming
izahn
 
PPTX
Hybrid acquisition of temporal scopes for rdf data
Anisa Rula
 
PPTX
Training in Analytics, R and Social Media Analytics
Ajay Ohri
 
PPTX
Natural Language Processing in R (rNLP)
fridolin.wild
 
PDF
defense
Qing Dou
 
PDF
Stack Algorithm
Kamal Singh Lodhi
 
PDF
Managing large datasets in R – ff examples and concepts
Ajay Ohri
 
PPTX
Merge Multiple CSV in single data frame using R
Yogesh Khandelwal
 
PDF
January 2016 Meetup: Speeding up (big) data manipulation with data.table package
Zurich_R_User_Group
 
Introduction to Data Mining with R and Data Import/Export in R
Yanchang Zhao
 
RDataMining slides-text-mining-with-r
Yanchang Zhao
 
Introduction to source{d} Engine and source{d} Lookout
source{d}
 
R language
LearningTech
 
Text mining and social network analysis of twitter data part 1
Johan Blomme
 
my$talk=qr{((?:ir)?reg(?:ular )?exp(?:ressions?)?)}i;
dankogai
 
2. R-basics, Vectors, Arrays, Matrices, Factors
krishna singh
 
Wireless sensor network Apriori an N-RMP
Amrit Khandelwal
 
Python pandas Library
Md. Sohag Miah
 
Text analytics in Python and R with examples from Tobacco Control
Ben Healey
 
Future features for openCypher: Schema, Constraints, Subqueries, Configurable...
openCypher
 
Introduction to R Programming
izahn
 
Hybrid acquisition of temporal scopes for rdf data
Anisa Rula
 
Training in Analytics, R and Social Media Analytics
Ajay Ohri
 
Natural Language Processing in R (rNLP)
fridolin.wild
 
defense
Qing Dou
 
Stack Algorithm
Kamal Singh Lodhi
 
Managing large datasets in R – ff examples and concepts
Ajay Ohri
 
Merge Multiple CSV in single data frame using R
Yogesh Khandelwal
 
January 2016 Meetup: Speeding up (big) data manipulation with data.table package
Zurich_R_User_Group
 

Viewers also liked (12)

PDF
Accessing Databases from R
Jeffrey Breen
 
PDF
Slides erm-cea-ia
Arthur Charpentier
 
PDF
IA-advanced-R
Arthur Charpentier
 
PDF
Slides ads ia
Arthur Charpentier
 
PDF
Classification
Arthur Charpentier
 
PDF
Slides lln-risques
Arthur Charpentier
 
PDF
15 03 16_data sciences pour l'actuariat_f. soulie fogelman
Arthur Charpentier
 
PDF
Slides barcelona Machine Learning
Arthur Charpentier
 
PDF
Graduate Econometrics Course, part 4, 2017
Arthur Charpentier
 
PDF
Econometrics, PhD Course, #1 Nonlinearities
Arthur Charpentier
 
PDF
Slides econometrics-2017-graduate-2
Arthur Charpentier
 
PDF
Econometrics 2017-graduate-3
Arthur Charpentier
 
Accessing Databases from R
Jeffrey Breen
 
Slides erm-cea-ia
Arthur Charpentier
 
IA-advanced-R
Arthur Charpentier
 
Slides ads ia
Arthur Charpentier
 
Classification
Arthur Charpentier
 
Slides lln-risques
Arthur Charpentier
 
15 03 16_data sciences pour l'actuariat_f. soulie fogelman
Arthur Charpentier
 
Slides barcelona Machine Learning
Arthur Charpentier
 
Graduate Econometrics Course, part 4, 2017
Arthur Charpentier
 
Econometrics, PhD Course, #1 Nonlinearities
Arthur Charpentier
 
Slides econometrics-2017-graduate-2
Arthur Charpentier
 
Econometrics 2017-graduate-3
Arthur Charpentier
 
Ad

Similar to Introduction to R for Data Science :: Session 5 [Data Structuring: Strings in R] (20)

PDF
R Programming: Learn To Manipulate Strings In R
Rsquared Academy
 
PPTX
How to handling strings in r
Pramod Rathore
 
PPTX
Introduction To Programming In R for data analyst
ssuser26ff68
 
PDF
Eag 201110-hrugregexpresentation-111006104128-phpapp02
egoodwintx
 
PPTX
ComputeFest 2012: Intro To R for Physical Sciences
alexstorer
 
PDF
[1062BPY12001] Data analysis with R / week 2
Kevin Chun-Hsien Hsu
 
PPTX
1.R_For_Libraries_Session_2_-_Data_Exploration.pptx
pathanthecreator1
 
PDF
1_Overview.pdf
ssuser2d043c
 
PDF
Manipulating string data with a pattern in R
Lun-Hsien Chang
 
PPTX
R Programming Tutorial for Beginners - -TIB Academy
rajkamaltibacademy
 
PDF
R strings
Learnbay Datascience
 
PPT
R Programming Intro
062MayankSinghal
 
PDF
FULL R PROGRAMMING METERIAL_2.pdf
attalurilalitha
 
PPTX
chapte_6_String_python_bca_2005_computer
hansibansal
 
PPTX
R Introduction
schamber
 
PPTX
R1-Intro (2udsjhfkjdshfkjsdkfhsdkfsfsffs
sabari Giri
 
PDF
Lecture1_R.pdf
BusyBird2
 
PDF
Python programming : Strings
Emertxe Information Technologies Pvt Ltd
 
PPT
Lecture1_R Programming Introduction1.ppt
premak23
 
PPT
Modeling in R Programming Language for Beginers.ppt
anshikagoel52
 
R Programming: Learn To Manipulate Strings In R
Rsquared Academy
 
How to handling strings in r
Pramod Rathore
 
Introduction To Programming In R for data analyst
ssuser26ff68
 
Eag 201110-hrugregexpresentation-111006104128-phpapp02
egoodwintx
 
ComputeFest 2012: Intro To R for Physical Sciences
alexstorer
 
[1062BPY12001] Data analysis with R / week 2
Kevin Chun-Hsien Hsu
 
1.R_For_Libraries_Session_2_-_Data_Exploration.pptx
pathanthecreator1
 
1_Overview.pdf
ssuser2d043c
 
Manipulating string data with a pattern in R
Lun-Hsien Chang
 
R Programming Tutorial for Beginners - -TIB Academy
rajkamaltibacademy
 
R Programming Intro
062MayankSinghal
 
FULL R PROGRAMMING METERIAL_2.pdf
attalurilalitha
 
chapte_6_String_python_bca_2005_computer
hansibansal
 
R Introduction
schamber
 
R1-Intro (2udsjhfkjdshfkjsdkfhsdkfsfsffs
sabari Giri
 
Lecture1_R.pdf
BusyBird2
 
Python programming : Strings
Emertxe Information Technologies Pvt Ltd
 
Lecture1_R Programming Introduction1.ppt
premak23
 
Modeling in R Programming Language for Beginers.ppt
anshikagoel52
 
Ad

More from Goran S. Milovanovic (20)

PDF
Uvod u R za Data Science :: Sesija 1 [Intro to R for Data Science :: Session 1]
Goran S. Milovanovic
 
PDF
Geneva Social Media Index - Report 2015 full report
Goran S. Milovanovic
 
PDF
Milovanović, G.S., Krstić, M. & Filipović, O. (2015). Kršenje homogenosti pre...
Goran S. Milovanovic
 
PDF
247113920-Cognitive-technologies-mapping-the-Internet-governance-debate
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 10. Simboličke funkcije, VI Deo: Rešavanje p...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 9. Simboličke funkcije, V Deo: Rezonovanje u...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 9. Simboličke funkcije, V Deo: Suđenje, heur...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 8. Simboličke funkcije, IV Deo: Analogija i ...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 9. Simboličke funkcije, III Deo: Kauzalnost,...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 8. Simboličke funkcije, II Deo: Distribuiran...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 8. Simboličke funkcije, II Deo: Konekcioniza...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 7a. Simboličke funkcije, I Deo: Učenje kateg...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 7. Simboličke funkcije, I Deo: Koncepti, kat...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 7. Učenje, IV Deo: Neasocijativno učenje, ef...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 6. Učenje, III Deo: Hernstejnov zakon slagan...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 6. Učenje, III Deo: Instrumentalno učenje
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 5. Učenje, II Deo: Blokiranje, osenčavanje, ...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 5. Učenje, II Deo: klasično uslovljavanje i ...
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 5. Učenje, I Deo
Goran S. Milovanovic
 
PDF
Učenje i viši kognitivni procesi 4a. Debata o racionalnosti, nastavak
Goran S. Milovanovic
 
Uvod u R za Data Science :: Sesija 1 [Intro to R for Data Science :: Session 1]
Goran S. Milovanovic
 
Geneva Social Media Index - Report 2015 full report
Goran S. Milovanovic
 
Milovanović, G.S., Krstić, M. & Filipović, O. (2015). Kršenje homogenosti pre...
Goran S. Milovanovic
 
247113920-Cognitive-technologies-mapping-the-Internet-governance-debate
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 10. Simboličke funkcije, VI Deo: Rešavanje p...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 9. Simboličke funkcije, V Deo: Rezonovanje u...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 9. Simboličke funkcije, V Deo: Suđenje, heur...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 8. Simboličke funkcije, IV Deo: Analogija i ...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 9. Simboličke funkcije, III Deo: Kauzalnost,...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 8. Simboličke funkcije, II Deo: Distribuiran...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 8. Simboličke funkcije, II Deo: Konekcioniza...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 7a. Simboličke funkcije, I Deo: Učenje kateg...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 7. Simboličke funkcije, I Deo: Koncepti, kat...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 7. Učenje, IV Deo: Neasocijativno učenje, ef...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 6. Učenje, III Deo: Hernstejnov zakon slagan...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 6. Učenje, III Deo: Instrumentalno učenje
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 5. Učenje, II Deo: Blokiranje, osenčavanje, ...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 5. Učenje, II Deo: klasično uslovljavanje i ...
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 5. Učenje, I Deo
Goran S. Milovanovic
 
Učenje i viši kognitivni procesi 4a. Debata o racionalnosti, nastavak
Goran S. Milovanovic
 

Recently uploaded (20)

PDF
Andreas Schleicher_Teaching Compass_Education 2040.pdf
EduSkills OECD
 
PPT
M&A5 Q1 1 differentiate evolving early Philippine conventional and contempora...
ErlizaRosete
 
PPTX
How to use _name_search() method in Odoo 18
Celine George
 
PDF
Gladiolous Cultivation practices by AKL.pdf
kushallamichhame
 
PPTX
ENGLISH -PPT- Week1 Quarter1 -day-1.pptx
garcialhavz
 
PDF
Romanticism in Love and Sacrifice An Analysis of Oscar Wilde’s The Nightingal...
KaryanaTantri21
 
PPTX
How to use grouped() method in Odoo 18 - Odoo Slides
Celine George
 
PDF
DIGESTION OF CARBOHYDRATES ,PROTEINS AND LIPIDS
raviralanaresh2
 
PPTX
Martyrs of Ireland - who kept the faith of St. Patrick.pptx
Martin M Flynn
 
PPTX
SYMPATHOMIMETICS[ADRENERGIC AGONISTS] pptx
saip95568
 
PPTX
How to Add New Item in CogMenu in Odoo 18
Celine George
 
PDF
Lesson 1 : Science and the Art of Geography Ecosystem
marvinnbustamante1
 
DOCX
MUSIC AND ARTS 5 DLL MATATAG LESSON EXEMPLAR QUARTER 1_Q1_W1.docx
DianaValiente5
 
PPTX
How to Create & Manage Stages in Odoo 18 Helpdesk
Celine George
 
PDF
Rapid Mathematics Assessment Score sheet for all Grade levels
DessaCletSantos
 
PPTX
2025 Completing the Pre-SET Plan Form.pptx
mansk2
 
PPTX
F-BLOCK ELEMENTS POWER POINT PRESENTATIONS
mprpgcwa2024
 
PPTX
JSON, XML and Data Science introduction.pptx
Ramakrishna Reddy Bijjam
 
PDF
THE PSYCHOANALYTIC OF THE BLACK CAT BY EDGAR ALLAN POE (1).pdf
nabilahk908
 
PPTX
How to Configure Taxes in Company Currency in Odoo 18 Accounting
Celine George
 
Andreas Schleicher_Teaching Compass_Education 2040.pdf
EduSkills OECD
 
M&A5 Q1 1 differentiate evolving early Philippine conventional and contempora...
ErlizaRosete
 
How to use _name_search() method in Odoo 18
Celine George
 
Gladiolous Cultivation practices by AKL.pdf
kushallamichhame
 
ENGLISH -PPT- Week1 Quarter1 -day-1.pptx
garcialhavz
 
Romanticism in Love and Sacrifice An Analysis of Oscar Wilde’s The Nightingal...
KaryanaTantri21
 
How to use grouped() method in Odoo 18 - Odoo Slides
Celine George
 
DIGESTION OF CARBOHYDRATES ,PROTEINS AND LIPIDS
raviralanaresh2
 
Martyrs of Ireland - who kept the faith of St. Patrick.pptx
Martin M Flynn
 
SYMPATHOMIMETICS[ADRENERGIC AGONISTS] pptx
saip95568
 
How to Add New Item in CogMenu in Odoo 18
Celine George
 
Lesson 1 : Science and the Art of Geography Ecosystem
marvinnbustamante1
 
MUSIC AND ARTS 5 DLL MATATAG LESSON EXEMPLAR QUARTER 1_Q1_W1.docx
DianaValiente5
 
How to Create & Manage Stages in Odoo 18 Helpdesk
Celine George
 
Rapid Mathematics Assessment Score sheet for all Grade levels
DessaCletSantos
 
2025 Completing the Pre-SET Plan Form.pptx
mansk2
 
F-BLOCK ELEMENTS POWER POINT PRESENTATIONS
mprpgcwa2024
 
JSON, XML and Data Science introduction.pptx
Ramakrishna Reddy Bijjam
 
THE PSYCHOANALYTIC OF THE BLACK CAT BY EDGAR ALLAN POE (1).pdf
nabilahk908
 
How to Configure Taxes in Company Currency in Odoo 18 Accounting
Celine George
 

Introduction to R for Data Science :: Session 5 [Data Structuring: Strings in R]

  • 1. Introduction to R for Data Science Lecturers dipl. ing Branko Kovač Data Analyst at CUBE/Data Science Mentor at Springboard Data Science zajednica Srbije [email protected] dr Goran S. Milovanović Data Scientist at DiploFoundation Data Science zajednica Srbije [email protected] [email protected]
  • 2. Strings in R • {base} for strings • {stringr} for strings • {stringi} for strings Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # Processing strings in R library(stringr) # strings in R are charactervectors stringA <- "Hello world" stringB <- "Sun shines!" stringA stringB is.character(stringA) # TRUE as.character(200*5) as.numeric("1000") as.double("3.14") # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # Using " and ' # either: stringA <- "Hello 'World'" stringA # or stringA <- 'Hello "World"' stringA # prints:"Hello "World"" - what is this: ? print(stringA) # try: writeLines(stringA) print(stringA) # Escapingin R: use , the R escape character stringA <- 'Hello "World"' stringA print(stringA) writeLines(stringA) # Escapingescaping writeLines("")# nice
  • 3. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # String Concatenationin R stringC <- c(stringA,stringB) # a character vectorof length == 2 length(stringC) stringC <- paste(stringA,stringB, sep=",") # length == 1, base function writeLines(stringC) # sep w. collapse (paste args) stringC <- c(stringA,stringB) stringC <- paste(stringC,collapse="__") writeLines(stringC) # paste0 is paste w. sep="",fasterthan paste(),base function # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 strA <- "One" strB <- "Two" strC <- "Three" paste0(strA,strB, strC) # the collapse argumentis used in paste0 as well strD <- c(strA,strB,strC) paste0(strD,collapse="-") # stringr concatenation,also has sep and collapse as args str_c(strA,strB,strC) str_c(strA,strB,strC,sep="...") str_c(strD,collapse="...") # both paste {base}and str_c {stringr} are vectorized paste("Prefix-",strD, sep="-") str_c("Prefix-",strD,sep="-") Strings in R • Concatenation
  • 4. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 stringA <- "The quick brown fox jumps overthe lazy dog"; splitA <- strsplit(stringA," ") # is.list(splitA) == T splitA <- unlist(strsplit(stringA," ")) # "The quick brown" from "The quick brown fox jumps overthe lazy dog" splitA <- paste(unlist(strsplit(stringA," "))[1:3],collapse=" ") # or splitA <- paste(strsplit(stringA," ")[[1]][1:3],collapse=" ") # advice:use splitA <- strsplit(stringA," ",fixed=T) # fixed=T says:match the split argumentexactly, # otherwise,split is an regularexpression;defaultis: fixed = FALSE # string split w. {stringr} is.list(str_split(stringA," ")) # this is interesting: str_split(stringA," ", n=3) # "The quick brown" from "The quick brown fox jumps overthe lazy dog" paste0(str_split(stringA," ", n=4)[[1]][1:3],collapse=" ") Strings in R • Splitting
  • 5. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # default: str_split(string,pattern,n = Inf), where pattern is regex str_split(stringA,boundary("word")) # very useful: stringA1 <- "The quick brown fox jumps overthe lazy dog" str_split(stringA1,boundary("word")) stringA1 <- "Aboveall, don'tlie to yourself. The man who lies to himselfand listens to his own lie comes to a pointthat he cannotdistinguish the truth within him, or around him,and so loses all respectfor himselfand for others. And having no respecthe ceasesto love." str_split(stringA1,boundary("word")) str_split(stringA1,boundary("word",skip_word_none= F)) # includingpunctuation and special str_split(stringA1,boundary("line_break")) writeLines(str_split(stringA1,boundary("line_break"))[[1]]) Strings in R • Splitting
  • 6. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 stringA <- c("Belgrade","Zagreb","Ljubljana")# {stringr} str_sub(stringA,1, 2) # counting backwards str_sub(stringA, -3, -1) # {base} substr(stringA,1, 3) # play: substr(stringA,c(1,2,3),c(2,3,4)) # nope: substr(stringA, -2, -1) # {base} Strings in R • Subsetting strings
  • 7. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # Replacingcharactersin strings stringB <- stringA # just a copy of stringA str_sub(stringB,1,2)<- "00" stringB # {base} stringB <- stringA # just a copy of stringA substr(stringB,1,3)<- "WowWow" # check the result! stringB substr(stringB,1,4)<- "WoWWow" # check the result! stringB substr(stringB,1,6)<- "WowWow" # check the result! stringB Strings in R • Subsetting strings # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # UPPER CASE to lower case and vice versa in R stringA <- "ABRACADABRA" # {base} tolower(stringA) stringA <- tolower(stringA) toupper(stringA) stringA <- toupper(stringA) # {stringr} str_to_lower(stringA) stringB <- str_to_lower(stringA) str_to_upper(stringA) # capitalize first letter str_to_title(stringB) • Transforming strings
  • 8. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # Remove whitespace stringA <- c(" Removewhitespace "); str_trim(stringA) # remove leading whitespace str_trim(stringA,side="left") # remove trailing whitespace str_trim(stringA,side="right") # remove all whitespace? stringA <- c(" Remove whitespace ") # how aboutthis one? # there are differentways to do it. Try: gsub(" ", "", stringA,fixed=T) # (!(fixed==T)),the first (pattern) argumentis regex # in general: stringA <- "The quick brown fox jumps overthe lazy dog The quick brown" gsub("Thequick brown","The slow red", stringA,fixed=T) Strings in R • More transforming
  • 9. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # Searchingfor somethingin a string # Does a string encompass a substring? grepl("Thequick brown",stringA,fixed = T) grepl("Thefastred", stringA, fixed = T) stringB <- "Uraaaaaaaa" grep("Thequick brown",c(stringA,stringB),fixed = T) # where? stringA <- "The quick brown fox jumps overthe lazy dog The quick brown" w <- gregexpr("Thequick brown",stringA) str(w) b1 <- w[[1]][1] # first match starts at b2 <- w[[1]][2] # second match starts at # now, match.length is an attribute of w[[1]], not w itself: e1 <- attr(w[[1]],"match.length",exact= T)[1] e2 <- attr(w[[1]],"match.length",exact= T)[2] Strings in R • Search
  • 10. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # first match extraction: str_sub(stringA,b1,b1+e1-1) # second matchextraction: str_sub(stringA,b2,b2+e2-1) # Ok, but easierand more convenientwith {stringr} str_detect(stringA,"The quickbrown") # T or F str_locate(stringA,"The quickbrown") # first match str_locate_all(stringA,"The quickbrown") # all matches # term frequency,as we know,is very importantin text-mining: term1 <- str_locate_all(stringA,"The quickbrown")[[1]]# all matches for term1 ie. "The quick brown" dim(term1)[1] # how many matches = how many rows in the str_locate_alloutputmatrix... Strings in R • Search
  • 11. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # Sorting strings in R letters str_sort(letters,locale="en")# locale = en str_sort(letters,locale="haw")# locale = Hawaiian # backwards str_sort(letters,decreasing= T) # handy: stringA <- c("New York","Paris",NA, "Moscow","Tokyo") str_sort(stringA,na_last=T) # [1] "Moscow" "New York" "Paris" "Tokyo" NA str_sort(stringA,na_last=F) # [1] NA "Moscow" "New York" "Paris" "Tokyo" # {base} sort(stringA) sort(stringA,decreasing=T) Strings in R • Sorting strings
  • 12. Intro to R for Data Science Session 5: Structuring Data: Strings in R # Introduction to R for Data Science # SESSION 5 :: 26 May, 2016 # Take home messageon encodings # 1. Most of the time, you simply need to know the source encoding # 2. All of the time *** converteverythingto UTF-8*** - as soon as possible # 3. Most {base},and all {stringr} and {stringi} functions thatprocessstrings in R # will converttheir outputto UTF-8 automatically # Working inside R only, running an English locale,will nevercause you any trouble # However,in Data Science you will probably needto do a lot of web-scraping fora living # - and that's where the fan starts. # God bless iconv()- but don'tget to excited,it does not avoid all problems # Next session:Thurday,June2, 2016 :: LinearRegressionw. R Strings in R • Encodings…