Description

Load WARC Files into Apache Spark.

Load WARC (Web ARChive) files into Apache Spark using 'sparklyr'. This makes it possible to read files from the Common Crawl project <http://commoncrawl.org/>.
README.md
sparkwarc - WARC files in sparklyr
Install
Install with:
devtools::install_github("javierluraschi/sparkwarc")
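If a released version is available on CRAN, it can also be installed with:

install.packages("sparkwarc")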
Intro
The following example loads a very small subset of a WARC file from Common Crawl, a nonprofit 501(c)(3) organization that crawls the web and freely provides its archives and datasets to the public.
library(sparkwarc)
library(sparklyr)
library(DBI)
library(dplyr)
sc <- spark_connect(master = "local")
## * Using Spark: 2.1.0
spark_read_warc(
sc,
"warc",
system.file("samples/sample.warc.gz", package = "sparkwarc"),
repartition = 8)
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<html', 0)) > 0
| count(value) |
| --- |
| 6 |
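The SQL query above runs against the warc table registered by spark_read_warc(). Since DBI is loaded, the same query can also be issued directly from R; a minimal sketch using DBI::dbGetQuery() on the sparklyr connection:

dbGetQuery(sc, "
  SELECT count(value)
  FROM WARC
  WHERE length(regexp_extract(value, '<html', 0)) > 0")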
# Count the most frequent non-empty matches produced by a query
cc_regex <- function(ops) {
  ops %>%
    filter(regval != "") %>%     # drop rows where the regex did not match
    group_by(regval) %>%
    summarize(count = n()) %>%   # tally each distinct match
    arrange(desc(count)) %>%
    head(100)                    # keep the 100 most frequent matches
}

# Extract the first capture group of `regex` from every line of the warc table
cc_stats <- function(regex) {
  tbl(sc, "warc") %>%
    transmute(regval = regexp_extract(value, regex, 1)) %>%
    cc_regex()
}
cc_stats("http-equiv=\"Content-Language\" content=\"(.*)\"")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 ru-RU 5
cc_stats("<script .*src=\".*/(.+)\".*")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 08.js 5
## 2 ga.js 5
## 3 jquery.formtips.1.2.2.packed.js 5
## 4 jquery-ui-1.7.2.custom.min.js 5
## 5 jquery-1.4.2.min.js 5
## 6 start.js 5
## 7 jquery.equalHeight.js 5
## 8 lytebox.js 5
## 9 plusone.js 5
cc_stats("<([a-zA-Z]+)>")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 li 53
## 2 span 26
## 3 th 18
## 4 p 17
## 5 ul 16
## 6 tr 13
## 7 strong 7
## 8 title 6
## 9 body 6
## 10 head 6
## 11 div 6
## 12 noscript 5
## 13 table 3
## 14 td 3
## 15 br 1
## 16 style 1
cc_stats(" ([a-zA-Z]{5,10}) ")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 counter 10
## 2 PUBLIC 6
## 3 return 6
## 4 Banners 5
## 5 widget 5
## 6 function 5
## 7 Banner 5
## 8 solid 2
## 9 Nutch 1
## 10 Domain 1
## 11 visit 1
## 12 crawl 1
## 13 Registry 1
## 14 Parked 1
## 15 Format 1
## 16 priceUAH 1
## 17 domain 1
cc_stats("<meta .*keywords.*content=\"([^,\"]+).*")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 Лес 1
## 2 Вип Степ 1
## 3 domain names 1
## 4 Регистрация-ликвидация предприятий 1
## 5 Свобода 1
## 6 Foxy 1
cc_stats("<script .*src=\".*/([^/]+.js)\".*")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 jquery.formtips.1.2.2.packed.js 5
## 2 08.js 5
## 3 ga.js 5
## 4 jquery.equalHeight.js 5
## 5 lytebox.js 5
## 6 plusone.js 5
## 7 jquery-ui-1.7.2.custom.min.js 5
## 8 jquery-1.4.2.min.js 5
## 9 start.js 5
spark_disconnect(sc)
Querying 1GB
warc_big <- normalizePath("~/cc.warc.gz") # Name a 5GB warc file
if (!file.exists(warc_big)) # If the file does not exist
download.file( # download by
gsub("s3n://commoncrawl/", # mapping the S3 bucket url
"https://commoncrawl.s3.amazonaws.com/", # into a adownloadable url
sparkwarc::cc_warc(1)), warc_big) # from the first archive file
config <- spark_config()
config[["spark.memory.fraction"]] <- "0.9"         # use most of the heap for execution and storage
config[["spark.executor.memory"]] <- "10G"         # memory per executor
config[["sparklyr.shell.driver-memory"]] <- "10G"  # memory for the local driver
sc <- spark_connect(master = "local", config = config)
## * Using Spark: 2.1.0
spark_read_warc(
sc,
"warc",
warc_big,
repartition = 8)
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<([a-z]+)>', 0)) > 0
| count(value) |
| --- |
| 6336761 |
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<html', 0)) > 0
| count(value) |
| --- |
| 74519 |
cc_stats("http-equiv=\"Content-Language\" content=\"([^\"]*)\"")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 en 533
## 2 en-us 323
## 3 ru 150
## 4 es 127
## 5 en-US 105
## 6 fr 95
## 7 de 92
## 8 pl 71
## 9 cs 48
## 10 ja 45
## # ... with 90 more rows
cc_stats("WARC-Target-URI: http://([^/]+)/.*")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 www.urbandictionary.com 156
## 2 my-shop.ru 69
## 3 hfboards.hockeysfuture.com 69
## 4 www.greatlakes4x4.com 66
## 5 www.opensecrets.org 60
## 6 www.summitpost.org 57
## 7 brainly.com.br 57
## 8 www.mobileread.com 54
## 9 www.genealogy.com 54
## 10 shop.ccs.com 51
## # ... with 90 more rows
cc_stats("<([a-zA-Z]+)>")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 li 2492324
## 2 span 506471
## 3 tr 440658
## 4 p 432221
## 5 td 398106
## 6 ul 258962
## 7 div 211937
## 8 script 198504
## 9 br 196993
## 10 strong 152675
## # ... with 90 more rows
cc_stats("<meta .*keywords.*content=\"([a-zA-Z0-9]+).*")
## # Source: lazy query [?? x 2]
## # Database: spark_connection
## # Ordered by: desc(count)
## regval count
## <chr> <dbl>
## 1 width 285
## 2 http 235
## 3 free 110
## 4 text 110
## 5 The 100
## 6 index 91
## 7 https 85
## 8 SKYPE 59
## 9 1 55
## 10 news 48
## # ... with 90 more rows
spark_disconnect(sc)
Querying 1TB
By running sparklyr in EMR, one can configure an EMR cluster and load about 5GB of data with:
sc <- spark_connect(master = "yarn-client")
spark_read_warc(sc, "warc", cc_warc(1, 1))
tbl(sc, "warc") %>% summarize(n = n())
spark_disconnect_all()
To read the first 200 files, or about 1TB of data, first scale up the cluster and consider maximizing resource allocation with the following EMR configuration:
[
{
"Classification": "spark",
"Properties": {
"maximizeResourceAllocation": "true"
}
}
]
Then load the [1, 200] file range with:
sc <- spark_connect(master = "yarn-client")
spark_read_warc(sc, "warc", cc_warc(1, 200))
tbl(sc, "warc") %>% summarize(n = n())
spark_disconnect_all()
To query the entire crawl, roughly 1PB of data, a custom script would be needed to load all of the WARC files.
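A rough sketch of what such a script could look like follows; it assumes cc_warc() accepts an arbitrary file range, that the cluster has enough capacity for each chunk, and uses hypothetical values for the chunk size and total file count:

library(sparklyr)
library(sparkwarc)
library(dplyr)

sc <- spark_connect(master = "yarn-client")

chunk_size <- 200       # hypothetical: WARC files loaded per iteration
total_files <- 1000     # hypothetical: total number of WARC files to process

line_counts <- list()
for (start in seq(1, total_files, by = chunk_size)) {
  end <- min(start + chunk_size - 1, total_files)
  table_name <- paste0("warc_", start)                   # one table per chunk
  spark_read_warc(sc, table_name, cc_warc(start, end))   # load this chunk of files
  line_counts[[table_name]] <- tbl(sc, table_name) %>%   # count lines in the chunk
    summarize(n = n()) %>%
    collect()
}

spark_disconnect_all()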