SlideShare a Scribd company logo
1
Connect With Us
Website ( )
Free Online R Courses ( )
R Packages ( )
Shiny Apps ( )
Blog ( )
GitHub ( )
YouTube ( )
Twitter ( )
Facebook ( )
Linkedin ( )
• https://www.rsquaredacademy.com/
• https://rsquared-academy.thinkific.com/
• https://pkgs.rsquaredacademy.com
• https://apps.rsquaredacademy.com
• https://blog.rsquaredacademy.com
• https://github.com/rsquaredacademy
• https://www.youtube.com/user/rsquaredin/
• https://twitter.com/rsquaredacademy
• https://www.facebook.com/rsquaredacademy/
• https://in.linkedin.com/company/rsquared-academy
2
what?
why?
how?
use cases
HTML basics
case studies
•
•
•
•
•
•
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
Libraries
library(robotstxt)
library(rvest)
library(selectr)
library(xml2)
library(dplyr)
library(stringr)
library(forcats)
library(magrittr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(tibble)
library(purrr)
20
21
robotstxt
paths_allowed(
paths = c("https://www.imdb.com/search/title?groups=top_250&sort=user_
)
##
www.imdb.com No encoding supplied: defaulting to U
## [1] TRUE
22
Read Web Page
imdb <- read_html("https://www.imdb.com/search/title?groups=top_250&sort
imdb
## {xml_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; chars
## [2] <body id="styleguide-v2" class="fixed">\n\n <img heigh
23
24
Title
# Movie titles: text of every <a> found under an <h3> inside a
# ".lister-item-content" element of the parsed page.
movie_title <- imdb %>%
  html_nodes(".lister-item-content h3 a") %>%
  html_text()
movie_title
## [1] "The Shawshank Redemption"
## [2] "The Godfather"
## [3] "The Dark Knight"
## [4] "The Godfather: Part II"
## [5] "The Lord of the Rings: The Return of the King"
## [6] "Pulp Fiction"
## [7] "Schindler's List"
## [8] "Il buono, il brutto, il cattivo"
## [9] "12 Angry Men"
## [10] "Inception"
## [11] "Fight Club"
## [12] "The Lord of the Rings: The Fellowship of the Ring"
## [13] "Forrest Gump"
## [14] "The Lord of the Rings: The Two Towers"
## [15] "The Matrix"
## [16] "Goodfellas"
## [17] "Star Wars: Episode V - The Empire Strikes Back"
25
26
Year of Release
# Release year of each movie, as a number.
#
# The year is read from the text of the ".lister-item-year" element by taking
# the first run of four digits, so it does not depend on the year sitting at
# fixed character positions (presumably the text looks like "(1994)" --
# confirm against the page).
#
# The digits are converted directly with as.numeric(). Going through
# as.Date(format = "%Y") is avoided on purpose: with only a year supplied,
# the missing month and day are taken from the current date, so on 29 February
# every non-leap year would come back as NA.
movie_year <- imdb %>%
  html_nodes(".lister-item-content h3 .lister-item-year") %>%
  html_text() %>%
  str_extract("\\d{4}") %>%
  as.numeric()
movie_year
## [1] 1994 1972 2008 1974 2003 1994 1993 1966 1957 2010 1999 2001 1994
## [15] 1999 1990 1980 1975 1954 2014 2002 2001 1998 1999 1997 1995 1995
## [29] 1991 1977 1946 2018 2016 2018 2018 2014 2011 2006 2006 2002 2000
## [43] 1998 1994 1991 1988 1988 1985 1981 1979
27
28
Certificate
# Certificate label of each movie, taken from the text of the ".certificate"
# elements inside the listing paragraphs.
#
# NOTE(review): html_nodes() returns matching elements only, so a listing
# without a certificate contributes nothing and this vector can be shorter
# than the other movie_* vectors.
movie_certificate <- imdb %>%
  html_nodes(".lister-item-content p .certificate") %>%
  html_text()
movie_certificate
## [1] "A" "A" "UA" "PG-13" "A" "A" "UA" "A"
## [9] "PG-13" "PG-13" "PG-13" "A" "A" "PG" "UA" "R"
## [17] "PG" "A" "A" "PG-13" "A" "R" "A" "A"
## [25] "U" "PG" "UA" "U" "U" "UA" "A" "UA"
## [33] "PG-13" "A" "R" "R" "R" "A" "U" "U"
## [41] "R" "U" "PG" "R"
29
30
Runtime
# Runtime of each movie as a number.
#
# The text of every ".runtime" element is split on single spaces; the first
# piece is kept and converted to numeric (presumably the text has the form
# "142 min" -- confirm against the page).
movie_runtime <- imdb %>%
  html_nodes(".lister-item-content p .runtime") %>%
  html_text() %>%
  str_split(" ") %>%
  map_chr(1) %>%
  as.numeric()
movie_runtime
## [1] 142 175 152 202 201 154 195 161 96 148 139 178 142 179 136 146
## [18] 133 207 169 130 125 169 189 116 106 127 110 118 121 130 139 161
## [35] 149 106 112 130 151 150 113 155 119 88 137 155 89 116 115 147
31
32
Genre
# Genre string of each movie: text of the ".genre" elements with leading and
# trailing whitespace removed.
movie_genre <- imdb %>%
  html_nodes(".lister-item-content p .genre") %>%
  html_text() %>%
  str_trim()
movie_genre
## [1] "Drama" "Crime, Drama"
## [3] "Action, Crime, Drama" "Crime, Drama"
## [5] "Adventure, Drama, Fantasy" "Crime, Drama"
## [7] "Biography, Drama, History" "Western"
## [9] "Drama" "Action, Adventure, Sci-Fi"
## [11] "Drama" "Adventure, Drama, Fantasy"
## [13] "Drama, Romance" "Adventure, Drama, Fantasy"
## [15] "Action, Sci-Fi" "Biography, Crime, Drama"
## [17] "Action, Adventure, Fantasy" "Drama"
## [19] "Adventure, Drama" "Adventure, Drama, Sci-Fi"
## [21] "Crime, Drama" "Animation, Adventure, Family"
## [23] "Drama, War" "Crime, Drama, Fantasy"
## [25] "Comedy, Drama, Romance" "Crime, Mystery, Thriller"
## [27] "Crime, Drama, Mystery" "Action, Crime, Drama"
## [29] "Crime, Drama, Thriller" "Action, Adventure, Fantasy"
## [31] "Drama, Family, Fantasy" "Crime, Thriller" 33
34
Rating
# Rating of each movie: the "data-value" attribute of the
# ".ratings-imdb-rating" elements, converted to numeric.
movie_rating <- imdb %>%
  html_nodes(".ratings-bar .ratings-imdb-rating") %>%
  html_attr("data-value") %>%
  as.numeric()
movie_rating
## [1] 9.3 9.2 9.0 9.0 8.9 8.9 8.9 8.9 8.9 8.8 8.8 8.8 8.8 8.7 8.7 8.7
## [18] 8.7 8.7 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.5 8.5
## [35] 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5
35
36
37
Votes
# Number of votes per movie: the "content" attribute of every <meta> element
# whose itemprop is "ratingCount", converted to numeric.
movie_votes <- imdb %>%
  html_nodes(xpath = '//meta[@itemprop="ratingCount"]') %>%
  html_attr('content') %>%
  as.numeric()
movie_votes
## [1] 2072893 1422292 2038787 987020 1475650 1621033 1074273 615219
## [9] 585562 1817393 1658750 1492209 1589127 1334563 1489071 895033
## [17] 1040130 822277 280024 1276946 637716 549410 1096231 1000909
## [25] 545280 897576 1271530 913352 1118817 1109777 352837 39132
## [33] 118413 174125 617621 605417 666327 1052901 1064050 633675
## [41] 1021511 1198326 941917 823238 897607 198398 192715 923178
## [49] 803033 542311
38
39
Revenue
# Revenue of each movie as a number (the surrounding slides label it as
# millions of dollars).
#
# The <span name="nv"> elements are read as text and only the texts that
# start with a dollar sign are kept. The "$" must be escaped in the pattern:
# unescaped it is the end-of-string anchor, so "^$.*" can match nothing but an
# empty string and every revenue value would be discarded.
#
# NOTE(review): html_nodes() returns matching elements only, so movies without
# a revenue figure leave no placeholder. The two append() calls put NA back
# after positions 30 and 46, which is tied to the page snapshot this was
# written against; re-check those positions whenever the page changes.
movie_revenue <- imdb %>%
  html_nodes(xpath = '//span[@name="nv"]') %>%
  html_text() %>%
  str_extract(pattern = "^\\$.*") %>%
  na.omit() %>%
  as.character() %>%
  append(values = NA, after = 30) %>%
  append(values = NA, after = 46) %>%
  # Drop the first character (the "$") and the last one (presumably a unit
  # suffix such as "M" -- confirm against the page).
  str_sub(start = 2, end = -2) %>%
  as.numeric()
movie_revenue
## [1] 28.34 134.97 534.86 57.30 377.85 107.93 96.07 6.10 4.36 2
## [11] 37.03 315.54 330.25 342.55 171.48 46.84 290.48 112.00 0.27 1
## [21] 7.56 10.06 216.54 136.80 57.60 23.34 100.13 19.50 130.74 3
## [31] NA 1.19 12.39 190.24 678.82 13.09 13.18 53.09 132.38
## [41] 25.54 187.71 6.72 312.90 204.84 11.99 NA 210.61 248.16
40
Putting it all together…
top_50 <- tibble(title = movie_title, release = movie_year,
`runtime (mins)` = movie_runtime, genre = movie_genre, rating = movi
votes = movie_votes, `revenue ($ millions)` = movie_revenue)
top_50
## # A tibble: 50 x 7
## title release `runtime (mins)` genre rating votes `revenue (
## <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 The Sha~ 1994 142 Drama 9.3 2.07e6
## 2 The God~ 1972 175 Crime,~ 9.2 1.42e6
## 3 The Dar~ 2008 152 Action~ 9 2.04e6
## 4 The God~ 1974 202 Crime,~ 9 9.87e5
## 5 The Lor~ 2003 201 Advent~ 8.9 1.48e6
## 6 Pulp Fi~ 1994 154 Crime,~ 8.9 1.62e6
## 7 Schindl~ 1993 195 Biogra~ 8.9 1.07e6
## 8 Il buon~ 1966 161 Western 8.9 6.15e5
## 9 12 Angr~ 1957 96 Drama 8.9 5.86e5
## 10 Incepti~ 2010 148 Action~ 8.8 1.82e6
## # ... with 40 more rows
41
42
robotstxt
paths_allowed(
paths = c("https://en.wikipedia.org/wiki/List_of_Governors_of_Reserve_
)
##
en.wikipedia.org
## [1] TRUE
43
Read Web Page
rbi_guv <- read_html("https://en.wikipedia.org/wiki/List_of_Governors_of
rbi_guv
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; chars
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-
44
List of Governors
# Parse every <table> on the page into a data frame and keep the second one,
# which is used below as the list of governors.
profile <- rbi_guv %>%
  html_nodes("table") %>%
  html_table() %>%
  .[[2]]
profile
## No. Officeholder Portrait Term start Term
## 1 1 Osborne Smith NA 1 April 1935 30 June 1
## 2 2 James Braid Taylor NA 1 July 1937 17 February 1
## 3 3 C. D. Deshmukh NA 11 August 1943ii 30 May 1
## 4 4 Benegal Rama Rau NA 1 July 1949 14 January 1
## 5 5 K. G. Ambegaonkar NA 14 January 1957 28 February 1
## 6 6 H. V. R. Iyengar NA 1 March 1957 28 February 1
## 7 7 P. C. Bhattacharya NA 1 March 1962 30 June 1
## 8 8 Lakshmi Kant Jha NA 1 July 1967 3 May 1
## 9 9 B. N. Adarkar NA 4 May 1970 15 June 1
## 10 10 Sarukkai Jagannathan NA 16 June 1970 19 May 1
## 11 11 N. C. Sen Gupta NA 19 May 1975 19 August 1
## 12 12 K. R. Puri NA 20 August 1975 2 May 1
## 13 13 M. Narasimham NA 3 May 1977 30 November 1
## 14 14 I. G. Patel NA 1 December 1977 15 September 1
## 15 15 Manmohan Singh NA 16 September 1982 14 January 1 45
Sort
# Rank office holders by length of term, longest first.
# `Term in office` is split into two pieces, "term" and "days"; "term" is
# compared as a number for the ordering, and only the name and term columns
# are shown.
profile %>%
  separate(`Term in office`, into = c("term", "days")) %>%
  arrange(desc(as.numeric(term))) %>%
  select(Officeholder, term)
## Officeholder term
## 1 Benegal Rama Rau 2754
## 2 C. D. Deshmukh 2150
## 3 R. N. Malhotra 2147
## 4 Bimal Jalan 2114
## 5 James Braid Taylor 2057
## 6 P. C. Bhattacharya 1947
## 7 Y. Venugopal Reddy 1826
## 8 H. V. R. Iyengar 1825
## 9 D. Subbarao 1825
## 10 Sarukkai Jagannathan 1798
## 11 C. Rangarajan 1795
## 12 I. G. Patel 1749
## 13 Raghuram Rajan 1096
## 14 Lakshmi Kant Jha 1037
## 15 Urjit Patel 947
## 16 Manmohan Singh 851
46
Backgrounds
# Number of governors for each distinct value of the Background column.
count(profile, Background)
## # A tibble: 9 x 2
## Background n
## <chr> <int>
## 1 "" 1
## 2 Banker 2
## 3 Career Reserve Bank of India officer 1
## 4 Economist 7
## 5 IAS officer 4
## 6 ICS officer 7
## 7 Indian Administrative Service (IAS) officer 1
## 8 Indian Audit and Accounts Service officer 1
## 9 Indian Civil Service (ICS) officer 1
47
Backgrounds
# Tally governors by background after merging related labels:
#   - the civil/administrative/audit service labels become "Bureaucrats"
#   - the empty string becomes "No Info"
#   - the career RBI label becomes "RBI Officer"
# Labels not listed here are left as they are. The columns produced by
# fct_count() (f, n) are renamed to background and count.
backgrounds <- profile %>%
  pull(Background) %>%
  fct_collapse(
    Bureaucrats = c(
      "IAS officer",
      "ICS officer",
      "Indian Administrative Service (IAS) officer",
      "Indian Audit and Accounts Service officer",
      "Indian Civil Service (ICS) officer"
    ),
    `No Info` = c(""),
    `RBI Officer` = c("Career Reserve Bank of India officer")
  ) %>%
  fct_count() %>%
  rename(background = f, count = n)
48
Backgrounds
backgrounds
## # A tibble: 5 x 2
## background count
## <fct> <int>
## 1 No Info 1
## 2 Banker 2
## 3 RBI Officer 1
## 4 Economist 7
## 5 Bureaucrats 14
49
Backgrounds
# Bar chart of the number of governors per background category.
ggplot(backgrounds) +
  geom_col(aes(x = background, y = count), fill = "blue") +
  labs(
    x = "Background",
    y = "Count",
    title = "Background of RBI Governors"
  )
50
51
Summary
web scraping is the extraction of data from web sites
best for static & well structured HTML pages
review robots.txt file
HTML code can change any time
if API is available, please use it
do not overwhelm websites with requests
•
•
•
•
•
•
52
53
Ad

Recommended

Indexing Complex PostgreSQL Data Types
Indexing Complex PostgreSQL Data Types
Jonathan Katz
 
MySQL・PostgreSQLだけで作る高速あいまい全文検索システム
MySQL・PostgreSQLだけで作る高速あいまい全文検索システム
Kouhei Sutou
 
Developing MIPS Exploits to Hack Routers
Developing MIPS Exploits to Hack Routers
BGA Cyber Security
 
Agile and DevOps
Agile and DevOps
Yasunobu Kawaguchi
 
OpenTelemetryを用いたObservability基礎の実装 with AWS Distro for OpenTelemetry(Kuberne...
OpenTelemetryを用いたObservability基礎の実装 with AWS Distro for OpenTelemetry(Kuberne...
NTT DATA Technology & Innovation
 
YugabyteDBの実行計画を眺める(NewSQL/分散SQLデータベースよろず勉強会 #3 発表資料)
YugabyteDBの実行計画を眺める(NewSQL/分散SQLデータベースよろず勉強会 #3 発表資料)
NTT DATA Technology & Innovation
 
Redmineでメトリクスを見える化する方法
Redmineでメトリクスを見える化する方法
Hidehisa Matsutani
 
PostgreSQL - C言語によるユーザ定義関数の作り方
PostgreSQL - C言語によるユーザ定義関数の作り方
Satoshi Nagayasu
 
pg_bigmを用いた全文検索のしくみ(前編)
pg_bigmを用いた全文検索のしくみ(前編)
NTT DATA OSS Professional Services
 
リアルタイムOSの必要性とTOPPERS/SSPの紹介
リアルタイムOSの必要性とTOPPERS/SSPの紹介
NSaitoNmiri
 
シリコンバレーでエンジニア就職する前に知りたかったこと
シリコンバレーでエンジニア就職する前に知りたかったこと
Tatsuya Nanjo
 
SSRF For Bug Bounties
SSRF For Bug Bounties
OWASP Nagpur
 
PostgreSQLアーキテクチャ入門(INSIGHT OUT 2011)
PostgreSQLアーキテクチャ入門(INSIGHT OUT 2011)
Uptime Technologies LLC (JP)
 
MroongaとPGroonga
MroongaとPGroonga
Kouhei Sutou
 
PostgreSQLの統計情報について(第26回PostgreSQLアンカンファレンス@オンライン 発表資料)
PostgreSQLの統計情報について(第26回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
Presentación Spring Boot en Autentia
Presentación Spring Boot en Autentia
Jorge Pacheco Mengual
 
Memoizeの仕組み(第41回PostgreSQLアンカンファレンス@オンライン 発表資料)
Memoizeの仕組み(第41回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
pg_bigmで全文検索するときに気を付けたい5つのポイント(第23回PostgreSQLアンカンファレンス@オンライン 発表資料)
pg_bigmで全文検索するときに気を付けたい5つのポイント(第23回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
Logstashを愛して5年、370ページを超えるガチ本を書いてしまった男の話.
Logstashを愛して5年、370ページを超えるガチ本を書いてしまった男の話.
Hibino Hisashi
 
Django REST Framework における API 実装プラクティス | PyCon JP 2018
Django REST Framework における API 実装プラクティス | PyCon JP 2018
Masashi Shibata
 
PostgreSQLモニタリング機能の現状とこれから(Open Developers Conference 2020 Online 発表資料)
PostgreSQLモニタリング機能の現状とこれから(Open Developers Conference 2020 Online 発表資料)
NTT DATA Technology & Innovation
 
SSRF workshop
SSRF workshop
Ivan Novikov
 
Go言語によるwebアプリの作り方
Go言語によるwebアプリの作り方
Yasutaka Kawamoto
 
Working with JSON Data in PostgreSQL vs. MongoDB
Working with JSON Data in PostgreSQL vs. MongoDB
ScaleGrid.io
 
Yahoo! JAPANのプライベートRDBクラウドとマルチライター型 MySQL #dbts2017 #dbtsOSS
Yahoo! JAPANのプライベートRDBクラウドとマルチライター型 MySQL #dbts2017 #dbtsOSS
Yahoo!デベロッパーネットワーク
 
闇魔術を触ってみた
闇魔術を触ってみた
Satoshi Sato
 
PostgreSQL共有バッファと関連ツール
PostgreSQL共有バッファと関連ツール
Masahiko Sawada
 
PostgreSQL 15 開発最新情報
PostgreSQL 15 開発最新情報
Masahiko Sawada
 
Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practiti...
Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practiti...
Raman Kannan
 
Writing Readable Code with Pipes
Writing Readable Code with Pipes
Rsquared Academy
 

More Related Content

What's hot (20)

pg_bigmを用いた全文検索のしくみ(前編)
pg_bigmを用いた全文検索のしくみ(前編)
NTT DATA OSS Professional Services
 
リアルタイムOSの必要性とTOPPERS/SSPの紹介
リアルタイムOSの必要性とTOPPERS/SSPの紹介
NSaitoNmiri
 
シリコンバレーでエンジニア就職する前に知りたかったこと
シリコンバレーでエンジニア就職する前に知りたかったこと
Tatsuya Nanjo
 
SSRF For Bug Bounties
SSRF For Bug Bounties
OWASP Nagpur
 
PostgreSQLアーキテクチャ入門(INSIGHT OUT 2011)
PostgreSQLアーキテクチャ入門(INSIGHT OUT 2011)
Uptime Technologies LLC (JP)
 
MroongaとPGroonga
MroongaとPGroonga
Kouhei Sutou
 
PostgreSQLの統計情報について(第26回PostgreSQLアンカンファレンス@オンライン 発表資料)
PostgreSQLの統計情報について(第26回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
Presentación Spring Boot en Autentia
Presentación Spring Boot en Autentia
Jorge Pacheco Mengual
 
Memoizeの仕組み(第41回PostgreSQLアンカンファレンス@オンライン 発表資料)
Memoizeの仕組み(第41回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
pg_bigmで全文検索するときに気を付けたい5つのポイント(第23回PostgreSQLアンカンファレンス@オンライン 発表資料)
pg_bigmで全文検索するときに気を付けたい5つのポイント(第23回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
Logstashを愛して5年、370ページを超えるガチ本を書いてしまった男の話.
Logstashを愛して5年、370ページを超えるガチ本を書いてしまった男の話.
Hibino Hisashi
 
Django REST Framework における API 実装プラクティス | PyCon JP 2018
Django REST Framework における API 実装プラクティス | PyCon JP 2018
Masashi Shibata
 
PostgreSQLモニタリング機能の現状とこれから(Open Developers Conference 2020 Online 発表資料)
PostgreSQLモニタリング機能の現状とこれから(Open Developers Conference 2020 Online 発表資料)
NTT DATA Technology & Innovation
 
SSRF workshop
SSRF workshop
Ivan Novikov
 
Go言語によるwebアプリの作り方
Go言語によるwebアプリの作り方
Yasutaka Kawamoto
 
Working with JSON Data in PostgreSQL vs. MongoDB
Working with JSON Data in PostgreSQL vs. MongoDB
ScaleGrid.io
 
Yahoo! JAPANのプライベートRDBクラウドとマルチライター型 MySQL #dbts2017 #dbtsOSS
Yahoo! JAPANのプライベートRDBクラウドとマルチライター型 MySQL #dbts2017 #dbtsOSS
Yahoo!デベロッパーネットワーク
 
闇魔術を触ってみた
闇魔術を触ってみた
Satoshi Sato
 
PostgreSQL共有バッファと関連ツール
PostgreSQL共有バッファと関連ツール
Masahiko Sawada
 
PostgreSQL 15 開発最新情報
PostgreSQL 15 開発最新情報
Masahiko Sawada
 
リアルタイムOSの必要性とTOPPERS/SSPの紹介
リアルタイムOSの必要性とTOPPERS/SSPの紹介
NSaitoNmiri
 
シリコンバレーでエンジニア就職する前に知りたかったこと
シリコンバレーでエンジニア就職する前に知りたかったこと
Tatsuya Nanjo
 
SSRF For Bug Bounties
SSRF For Bug Bounties
OWASP Nagpur
 
PostgreSQLアーキテクチャ入門(INSIGHT OUT 2011)
PostgreSQLアーキテクチャ入門(INSIGHT OUT 2011)
Uptime Technologies LLC (JP)
 
MroongaとPGroonga
MroongaとPGroonga
Kouhei Sutou
 
PostgreSQLの統計情報について(第26回PostgreSQLアンカンファレンス@オンライン 発表資料)
PostgreSQLの統計情報について(第26回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
Presentación Spring Boot en Autentia
Presentación Spring Boot en Autentia
Jorge Pacheco Mengual
 
Memoizeの仕組み(第41回PostgreSQLアンカンファレンス@オンライン 発表資料)
Memoizeの仕組み(第41回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
pg_bigmで全文検索するときに気を付けたい5つのポイント(第23回PostgreSQLアンカンファレンス@オンライン 発表資料)
pg_bigmで全文検索するときに気を付けたい5つのポイント(第23回PostgreSQLアンカンファレンス@オンライン 発表資料)
NTT DATA Technology & Innovation
 
Logstashを愛して5年、370ページを超えるガチ本を書いてしまった男の話.
Logstashを愛して5年、370ページを超えるガチ本を書いてしまった男の話.
Hibino Hisashi
 
Django REST Framework における API 実装プラクティス | PyCon JP 2018
Django REST Framework における API 実装プラクティス | PyCon JP 2018
Masashi Shibata
 
PostgreSQLモニタリング機能の現状とこれから(Open Developers Conference 2020 Online 発表資料)
PostgreSQLモニタリング機能の現状とこれから(Open Developers Conference 2020 Online 発表資料)
NTT DATA Technology & Innovation
 
Go言語によるwebアプリの作り方
Go言語によるwebアプリの作り方
Yasutaka Kawamoto
 
Working with JSON Data in PostgreSQL vs. MongoDB
Working with JSON Data in PostgreSQL vs. MongoDB
ScaleGrid.io
 
Yahoo! JAPANのプライベートRDBクラウドとマルチライター型 MySQL #dbts2017 #dbtsOSS
Yahoo! JAPANのプライベートRDBクラウドとマルチライター型 MySQL #dbts2017 #dbtsOSS
Yahoo!デベロッパーネットワーク
 
闇魔術を触ってみた
闇魔術を触ってみた
Satoshi Sato
 
PostgreSQL共有バッファと関連ツール
PostgreSQL共有バッファと関連ツール
Masahiko Sawada
 
PostgreSQL 15 開発最新情報
PostgreSQL 15 開発最新情報
Masahiko Sawada
 

Similar to Practical Introduction to Web scraping using R (20)

Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practiti...
Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practiti...
Raman Kannan
 
Writing Readable Code with Pipes
Writing Readable Code with Pipes
Rsquared Academy
 
第3回 データフレームの基本操作 その1(解答付き)
第3回 データフレームの基本操作 その1(解答付き)
Wataru Shito
 
MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709
Min-hyung Kim
 
Data manipulation and visualization in r 20190711 myanmarucsy
Data manipulation and visualization in r 20190711 myanmarucsy
SmartHinJ
 
第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出し
Wataru Shito
 
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
Wataru Shito
 
MLflow with R
MLflow with R
Databricks
 
20180806_座学(Lightning Flow)
20180806_座学(Lightning Flow)
Junko Nakayama
 
Connectix webserver
Connectix webserver
steveheer
 
Connectix webserver
Connectix webserver
steveheer
 
R Programming: Transform/Reshape Data In R
R Programming: Transform/Reshape Data In R
Rsquared Academy
 
Introduction to tibbles
Introduction to tibbles
Rsquared Academy
 
Media Mixer semantic technologies for UGC copyright management por Roberto Ga...
Media Mixer semantic technologies for UGC copyright management por Roberto Ga...
ACTUONDA
 
Overview of APEC Region Wine Trade 2011
Overview of APEC Region Wine Trade 2011
Asian Food Regulation Information Service
 
Introduction to R
Introduction to R
Stacy Irwin
 
Writing DSLs with Parslet - Wicked Good Ruby Conf
Writing DSLs with Parslet - Wicked Good Ruby Conf
Jason Garber
 
2015-10-23_wim_davis_r_slides.pptx on consumer
2015-10-23_wim_davis_r_slides.pptx on consumer
tirlukachaitanya
 
Murtaugh 2022 Appl Comp Genomics Tidyverse lecture.pptx-1.pptx
Murtaugh 2022 Appl Comp Genomics Tidyverse lecture.pptx-1.pptx
oliversen
 
Iwsm2014 extracting dependencies from software changes (thomas wetzlmaier -...
Iwsm2014 extracting dependencies from software changes (thomas wetzlmaier -...
Nesma
 
Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practiti...
Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practiti...
Raman Kannan
 
Writing Readable Code with Pipes
Writing Readable Code with Pipes
Rsquared Academy
 
第3回 データフレームの基本操作 その1(解答付き)
第3回 データフレームの基本操作 その1(解答付き)
Wataru Shito
 
MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709
Min-hyung Kim
 
Data manipulation and visualization in r 20190711 myanmarucsy
Data manipulation and visualization in r 20190711 myanmarucsy
SmartHinJ
 
第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出し
Wataru Shito
 
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
Wataru Shito
 
20180806_座学(Lightning Flow)
20180806_座学(Lightning Flow)
Junko Nakayama
 
Connectix webserver
Connectix webserver
steveheer
 
Connectix webserver
Connectix webserver
steveheer
 
R Programming: Transform/Reshape Data In R
R Programming: Transform/Reshape Data In R
Rsquared Academy
 
Media Mixer semantic technologies for UGC copyright management por Roberto Ga...
Media Mixer semantic technologies for UGC copyright management por Roberto Ga...
ACTUONDA
 
Introduction to R
Introduction to R
Stacy Irwin
 
Writing DSLs with Parslet - Wicked Good Ruby Conf
Writing DSLs with Parslet - Wicked Good Ruby Conf
Jason Garber
 
2015-10-23_wim_davis_r_slides.pptx on consumer
2015-10-23_wim_davis_r_slides.pptx on consumer
tirlukachaitanya
 
Murtaugh 2022 Appl Comp Genomics Tidyverse lecture.pptx-1.pptx
Murtaugh 2022 Appl Comp Genomics Tidyverse lecture.pptx-1.pptx
oliversen
 
Iwsm2014 extracting dependencies from software changes (thomas wetzlmaier -...
Iwsm2014 extracting dependencies from software changes (thomas wetzlmaier -...
Nesma
 
Ad

More from Rsquared Academy (20)

Handling Date & Time in R
Handling Date & Time in R
Rsquared Academy
 
Market Basket Analysis in R
Market Basket Analysis in R
Rsquared Academy
 
Joining Data with dplyr
Joining Data with dplyr
Rsquared Academy
 
Explore Data using dplyr
Explore Data using dplyr
Rsquared Academy
 
Data Wrangling with dplyr
Data Wrangling with dplyr
Rsquared Academy
 
Read data from Excel spreadsheets into R
Read data from Excel spreadsheets into R
Rsquared Academy
 
Read/Import data from flat/delimited files into R
Read/Import data from flat/delimited files into R
Rsquared Academy
 
Variables & Data Types in R
Variables & Data Types in R
Rsquared Academy
 
How to install & update R packages?
How to install & update R packages?
Rsquared Academy
 
How to get help in R?
How to get help in R?
Rsquared Academy
 
Introduction to R
Introduction to R
Rsquared Academy
 
RMySQL Tutorial For Beginners
RMySQL Tutorial For Beginners
Rsquared Academy
 
R Markdown Tutorial For Beginners
R Markdown Tutorial For Beginners
Rsquared Academy
 
R Data Visualization Tutorial: Bar Plots
R Data Visualization Tutorial: Bar Plots
Rsquared Academy
 
R Programming: Introduction to Matrices
R Programming: Introduction to Matrices
Rsquared Academy
 
R Programming: Introduction to Vectors
R Programming: Introduction to Vectors
Rsquared Academy
 
R Programming: Variables & Data Types
R Programming: Variables & Data Types
Rsquared Academy
 
Data Visualization With R: Learn To Combine Multiple Graphs
Data Visualization With R: Learn To Combine Multiple Graphs
Rsquared Academy
 
R Data Visualization: Learn To Add Text Annotations To Plots
R Data Visualization: Learn To Add Text Annotations To Plots
Rsquared Academy
 
Data Visualization With R: Learn To Modify Font Of Graphical Parameters
Data Visualization With R: Learn To Modify Font Of Graphical Parameters
Rsquared Academy
 
Market Basket Analysis in R
Market Basket Analysis in R
Rsquared Academy
 
Read data from Excel spreadsheets into R
Read data from Excel spreadsheets into R
Rsquared Academy
 
Read/Import data from flat/delimited files into R
Read/Import data from flat/delimited files into R
Rsquared Academy
 
Variables & Data Types in R
Variables & Data Types in R
Rsquared Academy
 
How to install & update R packages?
How to install & update R packages?
Rsquared Academy
 
RMySQL Tutorial For Beginners
RMySQL Tutorial For Beginners
Rsquared Academy
 
R Markdown Tutorial For Beginners
R Markdown Tutorial For Beginners
Rsquared Academy
 
R Data Visualization Tutorial: Bar Plots
R Data Visualization Tutorial: Bar Plots
Rsquared Academy
 
R Programming: Introduction to Matrices
R Programming: Introduction to Matrices
Rsquared Academy
 
R Programming: Introduction to Vectors
R Programming: Introduction to Vectors
Rsquared Academy
 
R Programming: Variables & Data Types
R Programming: Variables & Data Types
Rsquared Academy
 
Data Visualization With R: Learn To Combine Multiple Graphs
Data Visualization With R: Learn To Combine Multiple Graphs
Rsquared Academy
 
R Data Visualization: Learn To Add Text Annotations To Plots
R Data Visualization: Learn To Add Text Annotations To Plots
Rsquared Academy
 
Data Visualization With R: Learn To Modify Font Of Graphical Parameters
Data Visualization With R: Learn To Modify Font Of Graphical Parameters
Rsquared Academy
 
Ad

Recently uploaded (20)

PPT2 W1L2.pptx.........................................
PPT2 W1L2.pptx.........................................
palicteronalyn26
 
Presentation by Tariq & Mohammed (1).pptx
Presentation by Tariq & Mohammed (1).pptx
AbooddSandoqaa
 
The Influence off Flexible Work Policies
The Influence off Flexible Work Policies
sales480687
 
ppt somu_Jarvis_AI_Assistant_presen.pptx
ppt somu_Jarvis_AI_Assistant_presen.pptx
MohammedumarFarhan
 
624753984-Annex-A3-RPMS-Tool-for-Proficient-Teachers-SY-2024-2025.pdf
624753984-Annex-A3-RPMS-Tool-for-Proficient-Teachers-SY-2024-2025.pdf
CristineGraceAcuyan
 
Residential Zone 4 for industrial village
Residential Zone 4 for industrial village
MdYasinArafat13
 
NASA ESE Study Results v4 05.29.2020.pptx
NASA ESE Study Results v4 05.29.2020.pptx
CiroAlejandroCamacho
 
Microsoft Power BI - Advanced Certificate for Business Intelligence using Pow...
Microsoft Power BI - Advanced Certificate for Business Intelligence using Pow...
Prasenjit Debnath
 
最新版美国佐治亚大学毕业证(UGA毕业证书)原版定制
最新版美国佐治亚大学毕业证(UGA毕业证书)原版定制
Taqyea
 
最新版意大利米兰大学毕业证(UNIMI毕业证书)原版定制
最新版意大利米兰大学毕业证(UNIMI毕业证书)原版定制
taqyea
 
11_L2_Defects_and_Trouble_Shooting_2014[1].pdf
11_L2_Defects_and_Trouble_Shooting_2014[1].pdf
gun3awan88
 
RESEARCH-FINAL-GROUP-3, about the final .pptx
RESEARCH-FINAL-GROUP-3, about the final .pptx
gwapokoha1
 
Model Evaluation & Visualisation part of a series of intro modules for data ...
Model Evaluation & Visualisation part of a series of intro modules for data ...
brandonlee626749
 
Starbucks in the Indian market through its joint venture.
Starbucks in the Indian market through its joint venture.
sales480687
 
Informatics Market Insights AI Workforce.pdf
Informatics Market Insights AI Workforce.pdf
karizaroxx
 
presentation4.pdf Intro to mcmc methodss
presentation4.pdf Intro to mcmc methodss
SergeyTsygankov6
 
25 items quiz for practical research 1 in grade 11
25 items quiz for practical research 1 in grade 11
leamaydayaganon81
 
最新版美国芝加哥大学毕业证(UChicago毕业证书)原版定制
最新版美国芝加哥大学毕业证(UChicago毕业证书)原版定制
taqyea
 
UPS and Big Data intro to Business Analytics.pptx
UPS and Big Data intro to Business Analytics.pptx
sanjum5582
 
Flextronics Employee Safety Data-Project-2.pptx
Flextronics Employee Safety Data-Project-2.pptx
kilarihemadri
 
PPT2 W1L2.pptx.........................................
PPT2 W1L2.pptx.........................................
palicteronalyn26
 
Presentation by Tariq & Mohammed (1).pptx
Presentation by Tariq & Mohammed (1).pptx
AbooddSandoqaa
 
The Influence off Flexible Work Policies
The Influence off Flexible Work Policies
sales480687
 
ppt somu_Jarvis_AI_Assistant_presen.pptx
ppt somu_Jarvis_AI_Assistant_presen.pptx
MohammedumarFarhan
 
624753984-Annex-A3-RPMS-Tool-for-Proficient-Teachers-SY-2024-2025.pdf
624753984-Annex-A3-RPMS-Tool-for-Proficient-Teachers-SY-2024-2025.pdf
CristineGraceAcuyan
 
Residential Zone 4 for industrial village
Residential Zone 4 for industrial village
MdYasinArafat13
 
NASA ESE Study Results v4 05.29.2020.pptx
NASA ESE Study Results v4 05.29.2020.pptx
CiroAlejandroCamacho
 
Microsoft Power BI - Advanced Certificate for Business Intelligence using Pow...
Microsoft Power BI - Advanced Certificate for Business Intelligence using Pow...
Prasenjit Debnath
 
最新版美国佐治亚大学毕业证(UGA毕业证书)原版定制
最新版美国佐治亚大学毕业证(UGA毕业证书)原版定制
Taqyea
 
最新版意大利米兰大学毕业证(UNIMI毕业证书)原版定制
最新版意大利米兰大学毕业证(UNIMI毕业证书)原版定制
taqyea
 
11_L2_Defects_and_Trouble_Shooting_2014[1].pdf
11_L2_Defects_and_Trouble_Shooting_2014[1].pdf
gun3awan88
 
RESEARCH-FINAL-GROUP-3, about the final .pptx
RESEARCH-FINAL-GROUP-3, about the final .pptx
gwapokoha1
 
Model Evaluation & Visualisation part of a series of intro modules for data ...
Model Evaluation & Visualisation part of a series of intro modules for data ...
brandonlee626749
 
Starbucks in the Indian market through its joint venture.
Starbucks in the Indian market through its joint venture.
sales480687
 
Informatics Market Insights AI Workforce.pdf
Informatics Market Insights AI Workforce.pdf
karizaroxx
 
presentation4.pdf Intro to mcmc methodss
presentation4.pdf Intro to mcmc methodss
SergeyTsygankov6
 
25 items quiz for practical research 1 in grade 11
25 items quiz for practical research 1 in grade 11
leamaydayaganon81
 
最新版美国芝加哥大学毕业证(UChicago毕业证书)原版定制
最新版美国芝加哥大学毕业证(UChicago毕业证书)原版定制
taqyea
 
UPS and Big Data intro to Business Analytics.pptx
UPS and Big Data intro to Business Analytics.pptx
sanjum5582
 
Flextronics Employee Safety Data-Project-2.pptx
Flextronics Employee Safety Data-Project-2.pptx
kilarihemadri
 

Practical Introduction to Web scraping using R

  • 1. 1
  • 2. Connect With Us Website ( ) Free Online R Courses ( ) R Packages ( ) Shiny Apps ( ) Blog ( ) GitHub ( ) YouTube ( ) Twitter ( ) Facebook ( ) Linkedin ( ) • https://www.rsquaredacademy.com/ • https://rsquared-academy.thinkific.com/ • https://pkgs.rsquaredacademy.com • https://apps.rsquaredacademy.com • https://blog.rsquaredacademy.com • https://github.com/rsquaredacademy • https://www.youtube.com/user/rsquaredin/ • https://twitter.com/rsquaredacademy • https://www.facebook.com/rsquaredacademy/ • https://in.linkedin.com/company/rsquared-academy 2
  • 3. what? why? how? use cases HTML basics case studies • • • • • • 3
  • 4. 4
  • 5. 5
  • 6. 6
  • 7. 7
  • 8. 8
  • 9. 9
  • 10. 10
  • 11. 11
  • 12. 12
  • 13. 13
  • 14. 14
  • 15. 15
  • 16. 16
  • 17. 17
  • 18. 18
  • 19. 19
  • 21. 21
  • 23. Read Web Page imdb <- read_html("https://www.imdb.com/search/title?groups=top_250&sort imdb ## {xml_document} ## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/ ## [1] <head>n<meta http-equiv="Content-Type" content="text/html; chars ## [2] <body id="styleguide-v2" class="fixed">nn <img heigh 23
  • 24. 24
  • 25. Title imdb %>% html_nodes(".lister-item-content h3 a") %>% html_text() -> movie_title movie_title ## [1] "The Shawshank Redemption" ## [2] "The Godfather" ## [3] "The Dark Knight" ## [4] "The Godfather: Part II" ## [5] "The Lord of the Rings: The Return of the King" ## [6] "Pulp Fiction" ## [7] "Schindler's List" ## [8] "Il buono, il brutto, il cattivo" ## [9] "12 Angry Men" ## [10] "Inception" ## [11] "Fight Club" ## [12] "The Lord of the Rings: The Fellowship of the Ring" ## [13] "Forrest Gump" ## [14] "The Lord of the Rings: The Two Towers" ## [15] "The Matrix" ## [16] "Goodfellas" ## [17] "Star Wars: Episode V - The Empire Strikes Back" 25
  • 26. 26
  • 27. Year of Release imdb %>% html_nodes(".lister-item-content h3 .lister-item-year") %>% html_text() %>% str_sub(start = 2, end = 5) %>% as.Date(format = "%Y") %>% year() -> movie_year movie_year ## [1] 1994 1972 2008 1974 2003 1994 1993 1966 1957 2010 1999 2001 1994 ## [15] 1999 1990 1980 1975 1954 2014 2002 2001 1998 1999 1997 1995 1995 ## [29] 1991 1977 1946 2018 2016 2018 2018 2014 2011 2006 2006 2002 2000 ## [43] 1998 1994 1991 1988 1988 1985 1981 1979 27
  • 28. 28
  • 29. Certificate imdb %>% html_nodes(".lister-item-content p .certificate") %>% html_text() -> movie_certificate movie_certificate ## [1] "A" "A" "UA" "PG-13" "A" "A" "UA" "A" ## [9] "PG-13" "PG-13" "PG-13" "A" "A" "PG" "UA" "R" ## [17] "PG" "A" "A" "PG-13" "A" "R" "A" "A" ## [25] "U" "PG" "UA" "U" "U" "UA" "A" "UA" ## [33] "PG-13" "A" "R" "R" "R" "A" "U" "U" ## [41] "R" "U" "PG" "R" 29
  • 30. 30
  • 31. Runtime imdb %>% html_nodes(".lister-item-content p .runtime") %>% html_text() %>% str_split(" ") %>% map_chr(1) %>% as.numeric() -> movie_runtime movie_runtime ## [1] 142 175 152 202 201 154 195 161 96 148 139 178 142 179 136 146 ## [18] 133 207 169 130 125 169 189 116 106 127 110 118 121 130 139 161 ## [35] 149 106 112 130 151 150 113 155 119 88 137 155 89 116 115 147 31
  • 32. 32
  • 33. Genre imdb %>% html_nodes(".lister-item-content p .genre") %>% html_text() %>% str_trim() -> movie_genre movie_genre ## [1] "Drama" "Crime, Drama" ## [3] "Action, Crime, Drama" "Crime, Drama" ## [5] "Adventure, Drama, Fantasy" "Crime, Drama" ## [7] "Biography, Drama, History" "Western" ## [9] "Drama" "Action, Adventure, Sci-Fi" ## [11] "Drama" "Adventure, Drama, Fantasy" ## [13] "Drama, Romance" "Adventure, Drama, Fantasy" ## [15] "Action, Sci-Fi" "Biography, Crime, Drama" ## [17] "Action, Adventure, Fantasy" "Drama" ## [19] "Adventure, Drama" "Adventure, Drama, Sci-Fi" ## [21] "Crime, Drama" "Animation, Adventure, Family" ## [23] "Drama, War" "Crime, Drama, Fantasy" ## [25] "Comedy, Drama, Romance" "Crime, Mystery, Thriller" ## [27] "Crime, Drama, Mystery" "Action, Crime, Drama" ## [29] "Crime, Drama, Thriller" "Action, Adventure, Fantasy" ## [31] "Drama, Family, Fantasy" "Crime, Thriller" 33
  • 34. 34
  • 35. Rating imdb %>% html_nodes(".ratings-bar .ratings-imdb-rating") %>% html_attr("data-value") %>% as.numeric() -> movie_rating movie_rating ## [1] 9.3 9.2 9.0 9.0 8.9 8.9 8.9 8.9 8.9 8.8 8.8 8.8 8.8 8.7 8.7 8.7 ## [18] 8.7 8.7 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.5 8.5 ## [35] 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 35
  • 36. 36
  • 37. 37
  • 38. Votes imdb %>% html_nodes(xpath = '//meta[@itemprop="ratingCount"]') %>% html_attr('content') %>% as.numeric() -> movie_votes movie_votes ## [1] 2072893 1422292 2038787 987020 1475650 1621033 1074273 615219 ## [9] 585562 1817393 1658750 1492209 1589127 1334563 1489071 895033 ## [17] 1040130 822277 280024 1276946 637716 549410 1096231 1000909 ## [25] 545280 897576 1271530 913352 1118817 1109777 352837 39132 ## [33] 118413 174125 617621 605417 666327 1052901 1064050 633675 ## [41] 1021511 1198326 941917 823238 897607 198398 192715 923178 ## [49] 803033 542311 38
  • 39. 39
  • 40. Revenue imdb %>% html_nodes(xpath = '//span[@name="nv"]') %>% html_text() %>% str_extract(pattern = "^\\$.*") %>% na.omit() %>% as.character() %>% append(values = NA, after = 30) %>% append(values = NA, after = 46) %>% str_sub(start = 2, end = nchar(.) - 1) %>% as.numeric() -> movie_revenue movie_revenue ## [1] 28.34 134.97 534.86 57.30 377.85 107.93 96.07 6.10 4.36 2 ## [11] 37.03 315.54 330.25 342.55 171.48 46.84 290.48 112.00 0.27 1 ## [21] 7.56 10.06 216.54 136.80 57.60 23.34 100.13 19.50 130.74 3 ## [31] NA 1.19 12.39 190.24 678.82 13.09 13.18 53.09 132.38 ## [41] 25.54 187.71 6.72 312.90 204.84 11.99 NA 210.61 248.16 40
  • 41. Putting it all together… top_50 <- tibble(title = movie_title, release = movie_year, `runtime (mins)` = movie_runtime, genre = movie_genre, rating = movie_rating, votes = movie_votes, `revenue ($ millions)` = movie_revenue) top_50 ## # A tibble: 50 x 7 ## title release `runtime (mins)` genre rating votes `revenue ( ## <chr> <dbl> <dbl> <chr> <dbl> <dbl> ## 1 The Sha~ 1994 142 Drama 9.3 2.07e6 ## 2 The God~ 1972 175 Crime,~ 9.2 1.42e6 ## 3 The Dar~ 2008 152 Action~ 9 2.04e6 ## 4 The God~ 1974 202 Crime,~ 9 9.87e5 ## 5 The Lor~ 2003 201 Advent~ 8.9 1.48e6 ## 6 Pulp Fi~ 1994 154 Crime,~ 8.9 1.62e6 ## 7 Schindl~ 1993 195 Biogra~ 8.9 1.07e6 ## 8 Il buon~ 1966 161 Western 8.9 6.15e5 ## 9 12 Angr~ 1957 96 Drama 8.9 5.86e5 ## 10 Incepti~ 2010 148 Action~ 8.8 1.82e6 ## # ... with 40 more rows 41
  • 42. 42
  • 44. Read Web Page rbi_guv <- read_html("https://en.wikipedia.org/wiki/List_of_Governors_of rbi_guv ## {xml_document} ## <html class="client-nojs" lang="en" dir="ltr"> ## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; chars ## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns- 44
  • 45. List of Governors rbi_guv %>% html_nodes("table") %>% html_table() %>% extract2(2) -> profile profile ## No. Officeholder Portrait Term start Term ## 1 1 Osborne Smith NA 1 April 1935 30 June 1 ## 2 2 James Braid Taylor NA 1 July 1937 17 February 1 ## 3 3 C. D. Deshmukh NA 11 August 1943ii 30 May 1 ## 4 4 Benegal Rama Rau NA 1 July 1949 14 January 1 ## 5 5 K. G. Ambegaonkar NA 14 January 1957 28 February 1 ## 6 6 H. V. R. Iyengar NA 1 March 1957 28 February 1 ## 7 7 P. C. Bhattacharya NA 1 March 1962 30 June 1 ## 8 8 Lakshmi Kant Jha NA 1 July 1967 3 May 1 ## 9 9 B. N. Adarkar NA 4 May 1970 15 June 1 ## 10 10 Sarukkai Jagannathan NA 16 June 1970 19 May 1 ## 11 11 N. C. Sen Gupta NA 19 May 1975 19 August 1 ## 12 12 K. R. Puri NA 20 August 1975 2 May 1 ## 13 13 M. Narasimham NA 3 May 1977 30 November 1 ## 14 14 I. G. Patel NA 1 December 1977 15 September 1 ## 15 15 Manmohan Singh NA 16 September 1982 14 January 1 45
  • 46. Sort profile %>% separate(`Term in office`, into = c("term", "days")) %>% select(Officeholder, term) %>% arrange(desc(as.numeric(term))) ## Officeholder term ## 1 Benegal Rama Rau 2754 ## 2 C. D. Deshmukh 2150 ## 3 R. N. Malhotra 2147 ## 4 Bimal Jalan 2114 ## 5 James Braid Taylor 2057 ## 6 P. C. Bhattacharya 1947 ## 7 Y. Venugopal Reddy 1826 ## 8 H. V. R. Iyengar 1825 ## 9 D. Subbarao 1825 ## 10 Sarukkai Jagannathan 1798 ## 11 C. Rangarajan 1795 ## 12 I. G. Patel 1749 ## 13 Raghuram Rajan 1096 ## 14 Lakshmi Kant Jha 1037 ## 15 Urjit Patel 947 ## 16 Manmohan Singh 851 46
  • 47. Backgrounds profile %>% count(Background) ## # A tibble: 9 x 2 ## Background n ## <chr> <int> ## 1 "" 1 ## 2 Banker 2 ## 3 Career Reserve Bank of India officer 1 ## 4 Economist 7 ## 5 IAS officer 4 ## 6 ICS officer 7 ## 7 Indian Administrative Service (IAS) officer 1 ## 8 Indian Audit and Accounts Service officer 1 ## 9 Indian Civil Service (ICS) officer 1 47
  • 48. Backgrounds profile %>% pull(Background) %>% fct_collapse( Bureaucrats = c("IAS officer", "ICS officer", "Indian Administrative Service (IAS) officer", "Indian Audit and Accounts Service officer", "Indian Civil Service (ICS) officer"), `No Info` = c(""), `RBI Officer` = c("Career Reserve Bank of India officer") ) %>% fct_count() %>% rename(background = f, count = n) -> backgrounds 48
  • 49. Backgrounds backgrounds ## # A tibble: 5 x 2 ## background count ## <fct> <int> ## 1 No Info 1 ## 2 Banker 2 ## 3 RBI Officer 1 ## 4 Economist 7 ## 5 Bureaucrats 14 49
  • 50. Backgrounds backgrounds %>% ggplot() + geom_col(aes(background, count), fill = "blue") + xlab("Background") + ylab("Count") + ggtitle("Background of RBI Governors") 50
  • 51. 51
  • 52. Summary: web scraping is the extraction of data from web sites; it works best for static & well-structured HTML pages; review the robots.txt file; HTML code can change at any time; if an API is available, please use it; do not overwhelm websites with requests. 52
  • 53. 53