MyNixOS website logo
Description

Load WARC Files into Apache Spark.

Load WARC (Web ARChive) files into Apache Spark using 'sparklyr'. This allows reading files from the Common Crawl project <http://commoncrawl.org/>.

sparkwarc - WARC files in sparklyr

Install

Install with:

devtools::install_github("javierluraschi/sparkwarc")

Intro

The following example loads a very small subset of a WARC file from Common Crawl, a nonprofit 501(c)(3) organization that crawls the web and freely provides its archives and datasets to the public.

library(sparkwarc)
library(sparklyr)
library(DBI)
library(dplyr)
sc <- spark_connect(master = "local")
## * Using Spark: 2.1.0
spark_read_warc(
  sc,
  "warc",
  system.file("samples/sample.warc.gz", package = "sparkwarc"),
  repartition = 8)
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<html', 0)) > 0
count(value)
6
# Rank the values of the `regval` column by how often they occur.
#
# ops: a table with a `regval` column (here, the result of a regex
#      extraction over the WARC lines).
# Returns at most 100 rows with columns `regval` and `count`, ordered by
# descending `count`; rows whose `regval` is the empty string are dropped.
cc_regex <- function(ops) {
  # An empty string means the regex did not match on that line.
  matched <- filter(ops, regval != "")
  tallied <- summarize(group_by(matched, regval), count = n())
  ranked <- arrange(tallied, desc(count))
  head(ranked, 100)
}

# Summarize what a regular expression captures across the "warc" table.
#
# regex: pattern handed to regexp_extract(); capture group 1 becomes the
#        `regval` column.
# Reads the "warc" table through the global connection `sc` and returns the
# frequency ranking produced by cc_regex().
cc_stats <- function(regex) {
  warc_lines <- tbl(sc, "warc")
  extracted <- transmute(
    warc_lines,
    regval = regexp_extract(value, regex, 1)
  )
  cc_regex(extracted)
}
cc_stats("http-equiv=\"Content-Language\" content=\"(.*)\"")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##   regval count
##    <chr> <dbl>
## 1  ru-RU     5
cc_stats("<script .*src=\".*/(.+)\".*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##                            regval count
##                             <chr> <dbl>
## 1                           08.js     5
## 2                           ga.js     5
## 3 jquery.formtips.1.2.2.packed.js     5
## 4   jquery-ui-1.7.2.custom.min.js     5
## 5             jquery-1.4.2.min.js     5
## 6                        start.js     5
## 7           jquery.equalHeight.js     5
## 8                      lytebox.js     5
## 9                      plusone.js     5
cc_stats("<([a-zA-Z]+)>")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##      regval count
##       <chr> <dbl>
##  1       li    53
##  2     span    26
##  3       th    18
##  4        p    17
##  5       ul    16
##  6       tr    13
##  7   strong     7
##  8    title     6
##  9     body     6
## 10     head     6
## 11      div     6
## 12 noscript     5
## 13    table     3
## 14       td     3
## 15       br     1
## 16    style     1
cc_stats(" ([a-zA-Z]{5,10}) ")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##      regval count
##       <chr> <dbl>
##  1  counter    10
##  2   PUBLIC     6
##  3   return     6
##  4  Banners     5
##  5   widget     5
##  6 function     5
##  7   Banner     5
##  8    solid     2
##  9    Nutch     1
## 10   Domain     1
## 11    visit     1
## 12    crawl     1
## 13 Registry     1
## 14   Parked     1
## 15   Format     1
## 16 priceUAH     1
## 17   domain     1
cc_stats("<meta .*keywords.*content=\"([^,\"]+).*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##                               regval count
##                                <chr> <dbl>
## 1                                Лес     1
## 2                           Вип Степ     1
## 3                       domain names     1
## 4 Регистрация-ликвидация предприятий     1
## 5                            Свобода     1
## 6                               Foxy     1
cc_stats("<script .*src=\".*/([^/]+.js)\".*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##                            regval count
##                             <chr> <dbl>
## 1 jquery.formtips.1.2.2.packed.js     5
## 2                           08.js     5
## 3                           ga.js     5
## 4           jquery.equalHeight.js     5
## 5                      lytebox.js     5
## 6                      plusone.js     5
## 7   jquery-ui-1.7.2.custom.min.js     5
## 8             jquery-1.4.2.min.js     5
## 9                        start.js     5
spark_disconnect(sc)

Querying 1GB

# Resolve the local path of the archive and download it when it is missing.
# mustWork = FALSE: the file may not exist yet, so do not warn about it.
warc_big <- normalizePath("~/cc.warc.gz",           # Name a 5GB warc file
                          mustWork = FALSE)
if (!file.exists(warc_big)) {                       # If the file does not exist
  download.file(                                    # download by
    gsub("s3n://commoncrawl/",                      # mapping the S3 bucket url
         "https://commoncrawl.s3.amazonaws.com/",   # into a downloadable url
         sparkwarc::cc_warc(1)),                    # from the first archive file
    warc_big,
    mode = "wb")                                    # binary, keeps .gz intact
}
config <- spark_config()
config[["spark.memory.fraction"]] <- "0.9"
config[["spark.executor.memory"]] <- "10G"
config[["sparklyr.shell.driver-memory"]] <- "10G"

sc <- spark_connect(master = "local", config = config)
## * Using Spark: 2.1.0
spark_read_warc(
  sc,
  "warc",
  warc_big,
  repartition = 8)

df <- data.frame(list(a = list("a,b,c")))

SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<([a-z]+)>', 0)) > 0
count(value)
6336761
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<html', 0)) > 0
count(value)
74519
cc_stats("http-equiv=\"Content-Language\" content=\"([^\"]*)\"")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##    regval count
##     <chr> <dbl>
##  1     en   533
##  2  en-us   323
##  3     ru   150
##  4     es   127
##  5  en-US   105
##  6     fr    95
##  7     de    92
##  8     pl    71
##  9     cs    48
## 10     ja    45
## # ... with 90 more rows
cc_stats("WARC-Target-URI: http://([^/]+)/.*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##                        regval count
##                         <chr> <dbl>
##  1    www.urbandictionary.com   156
##  2                 my-shop.ru    69
##  3 hfboards.hockeysfuture.com    69
##  4      www.greatlakes4x4.com    66
##  5        www.opensecrets.org    60
##  6         www.summitpost.org    57
##  7             brainly.com.br    57
##  8         www.mobileread.com    54
##  9          www.genealogy.com    54
## 10               shop.ccs.com    51
## # ... with 90 more rows
cc_stats("<([a-zA-Z]+)>")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##    regval   count
##     <chr>   <dbl>
##  1     li 2492324
##  2   span  506471
##  3     tr  440658
##  4      p  432221
##  5     td  398106
##  6     ul  258962
##  7    div  211937
##  8 script  198504
##  9     br  196993
## 10 strong  152675
## # ... with 90 more rows
cc_stats("<meta .*keywords.*content=\"([a-zA-Z0-9]+).*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##    regval count
##     <chr> <dbl>
##  1  width   285
##  2   http   235
##  3   free   110
##  4   text   110
##  5    The   100
##  6  index    91
##  7  https    85
##  8  SKYPE    59
##  9      1    55
## 10   news    48
## # ... with 90 more rows
spark_disconnect(sc)

Querying 1TB

By running sparklyr in EMR, one can configure an EMR cluster and load about 5GB of data using:

sc <- spark_connect(master = "yarn-client")
spark_read_warc(sc, "warc", cc_warc(1, 1))

tbl(sc, "warc") %>% summarize(n = n())
spark_disconnect_all()

To read the first 200 files, or about 1TB of data, first scale the cluster; consider maximizing resource allocation with the following EMR config:

[
  {
    "Classification": "spark",
    "Properties": {
      "maximizeResourceAllocation": "true"
    }
  }
]

Followed by loading the [1, 200] file range with:

sc <- spark_connect(master = "yarn-client")
spark_read_warc(sc, "warc", cc_warc(1, 200))

tbl(sc, "warc") %>% summarize(n = n())
spark_disconnect_all()

To query ~1PB for the entire crawl, a custom script would be needed to load all the WARC files.

Metadata

Version

0.1.6

License

Unknown

Platforms (77)

    Darwin
    FreeBSD
    Genode
    GHCJS
    Linux
    MMIXware
    NetBSD
    none
    OpenBSD
    Redox
    Solaris
    WASI
    Windows
Show all
  • aarch64-darwin
  • aarch64-freebsd
  • aarch64-genode
  • aarch64-linux
  • aarch64-netbsd
  • aarch64-none
  • aarch64-windows
  • aarch64_be-none
  • arm-none
  • armv5tel-linux
  • armv6l-linux
  • armv6l-netbsd
  • armv6l-none
  • armv7a-darwin
  • armv7a-linux
  • armv7a-netbsd
  • armv7l-linux
  • armv7l-netbsd
  • avr-none
  • i686-cygwin
  • i686-darwin
  • i686-freebsd
  • i686-genode
  • i686-linux
  • i686-netbsd
  • i686-none
  • i686-openbsd
  • i686-windows
  • javascript-ghcjs
  • loongarch64-linux
  • m68k-linux
  • m68k-netbsd
  • m68k-none
  • microblaze-linux
  • microblaze-none
  • microblazeel-linux
  • microblazeel-none
  • mips-linux
  • mips-none
  • mips64-linux
  • mips64-none
  • mips64el-linux
  • mipsel-linux
  • mipsel-netbsd
  • mmix-mmixware
  • msp430-none
  • or1k-none
  • powerpc-netbsd
  • powerpc-none
  • powerpc64-linux
  • powerpc64le-linux
  • powerpcle-none
  • riscv32-linux
  • riscv32-netbsd
  • riscv32-none
  • riscv64-linux
  • riscv64-netbsd
  • riscv64-none
  • rx-none
  • s390-linux
  • s390-none
  • s390x-linux
  • s390x-none
  • vc4-none
  • wasm32-wasi
  • wasm64-wasi
  • x86_64-cygwin
  • x86_64-darwin
  • x86_64-freebsd
  • x86_64-genode
  • x86_64-linux
  • x86_64-netbsd
  • x86_64-none
  • x86_64-openbsd
  • x86_64-redox
  • x86_64-solaris
  • x86_64-windows