MyNixOS website logo
Description

Fast Simple URL Parser.

A fast and simple 'URL' parser package for 'R'. This package provides functions to parse 'URLs' into their components, such as scheme, user, password, host, port, path, query, and fragment.

urlparse

CRANstatus R-CMD-check Codecov testcoverage urlparse statusbadge

Fast and simple url parser for R. Initially developed for the paws.common package.

urlparse::url_parse("https://user:[email protected]:8000/path?query=1#fragment")
#> $scheme
#> [1] "https"
#> 
#> $user
#> [1] "user"
#> 
#> $password
#> [1] "pass"
#> 
#> $host
#> [1] "host.com"
#> 
#> $port
#> [1] "8000"
#> 
#> $path
#> [1] "/path"
#> 
#> $raw_path
#> [1] ""
#> 
#> $query
#> $query$query
#> [1] "1"
#> 
#> 
#> $raw_query
#> [1] "query=1"
#> 
#> $fragment
#> [1] "fragment"

Installation

You can install the development version of urlparse like so:

remotes::install_github("dyfanjones/urlparse")

r-universe installation:

install.packages("urlparse", repos = c("https://dyfanjones.r-universe.dev", "https://cloud.r-project.org"))

Example

This is a basic example which shows you how to solve a common problem:

library(urlparse)
url_encoder("foo = bar + 5")
#> [1] "foo%20%3D%20bar%20%2B%205"

url_decoder(url_encoder("foo = bar + 5"))
#> [1] "foo = bar + 5"

Similar to python’s from urllib.parse import quote, urlparse::url_encoder supports the safe parameter. The additional ASCII characters that should not be encoded.

from urllib.parse import quote
quote("foo = bar + 5", safe = "+")
#> 'foo%20%3D%20bar%20+%205'
url_encoder("foo = bar + 5", safe = "+")
#> [1] "foo%20%3D%20bar%20+%205"

Modify an url through piping using the set_* functions or using the stand alone url_modify function.


url <- "http://example.com"
set_scheme(url, "https") |>
  set_port(1234L) |>
  set_path("foo/bar") |>
  set_query("baz") |>
  set_fragment("quux")
#> [1] "https://example.com:1234/foo/bar?baz#quux"

url_modify(url, scheme = "https", port = 1234, path = "foo/bar", query = "baz", fragment = "quux")
#> [1] "https://example.com:1234/foo/bar?baz#quux"

Note: it is faster to use url_modify rather than piping the set_* functions. This is because urlparse has to parse the url within each set_* to modify the url.

url <- "http://example.com"
bench::mark(
  piping = {set_scheme(url, "https") |>
  set_port(1234L) |>
  set_path("foo/bar") |>
  set_query("baz") |>
  set_fragment("quux")},
  single_function = url_modify(url, scheme = "https", port = 1234, path = "foo/bar", query = "baz", fragment = "quux")
)
#> # A tibble: 2 × 6
#>   expression           min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 piping            5.29µs    5.9µs   162854.        0B     16.3
#> 2 single_function    1.6µs    1.8µs   517151.        0B     51.7

Benchmark:

Parsing URL:

url <- "https://user:[email protected]:8000/path?query=1#fragment"
(bm <- bench::mark(
  urlparse = urlparse::url_parse(url),
  httr2 = httr2::url_parse(url),
  curl = curl::curl_parse_url(url),
  urltools = urltools::url_parse(url),
  check = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse     1.72µs   1.93µs   488712.        0B      0  
#> 2 httr2       22.39µs  23.86µs    40406.  571.07KB     24.3
#> 3 curl        27.06µs  29.64µs    29324.        0B     14.7
#> 4 urltools   124.44µs 132.68µs     7119.    2.18MB     23.2

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse     1      1       68.6        NaN      NaN
#> 2 httr2       13.0   12.4      5.68       Inf      Inf
#> 3 curl        15.7   15.4      4.12       NaN      Inf
#> 4 urltools    72.3   68.9      1          Inf      Inf

ggplot2::autoplot(bm)

Since urlpase v0.1.999+ you can use the vectorised url parser url_parser_v2

urls <- c(
  "https://www.example.com",
  "https://www.google.com/maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519",
  "https://user_1:[email protected]:8080/dir/../api?q=1#frag",
  "https://user:[email protected]",
  "https://www.example.com:8080/search%3D1%2B3",
  "https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4",
  "https://www.example.com:8080?var1=foo&var2=ba%20r&var3=baz+larry",
  "https://user:[email protected]:8080",
  "https://user:[email protected]",
  "https://[email protected]:8080",
  "https://[email protected]"
)
(bm <- bench::mark(
  urlparse = lapply(urls, urlparse::url_parse),
  urlparse_v2 = urlparse::url_parse_v2(urls),
  httr2 =  lapply(urls, httr2::url_parse),
  curl = lapply(urls, curl::curl_parse_url),
  urltools = urltools::url_parse(urls),
  check = F
))
#> # A tibble: 5 × 6
#>   expression       min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse      19.4µs   21.3µs    46214.      200B    13.9 
#> 2 urlparse_v2   10.5µs     11µs    87963.     4.3KB     0   
#> 3 httr2        171.6µs  181.6µs     5232.        0B    10.2 
#> 4 curl         188.7µs  198.4µs     4895.        0B     8.14
#> 5 urltools       130µs  142.1µs     6569.        0B    10.2

show_relative(bm)
#> # A tibble: 5 × 6
#>   expression    min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse     1.85   1.94      9.44       Inf      Inf
#> 2 urlparse_v2  1      1        18.0        Inf      NaN
#> 3 httr2       16.4   16.5       1.07       NaN      Inf
#> 4 curl        18.0   18.1       1          NaN      Inf
#> 5 urltools    12.4   12.9       1.34       NaN      Inf

ggplot2::autoplot(bm)

Note: url_parse_v2 returns the parsed url as a data.frame this is similar behaviour to urltools and adaR:

urlparse::url_parse_v2(urls)
#>                                                                                                                                                                       href
#> 1                                                                                                                                                  https://www.example.com
#> 2  https://www.google.com/maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3                                                                                                           https://user_1:[email protected]:8080/dir/../api?q=1#frag
#> 4                                                                                                                                        https://user:[email protected]
#> 5                                                                                                                              https://www.example.com:8080/search%3D1%2B3
#> 6                                                                                                                                 https://www.google.co.jp/search?q=ドイツ
#> 7                                                                                                         https://www.example.com:8080?var1=foo&var2=ba%20r&var3=baz+larry
#> 8                                                                                                                                   https://user:[email protected]:8080
#> 9                                                                                                                                        https://user:[email protected]
#> 10                                                                                                                                           https://[email protected]:8080
#> 11                                                                                                                                                https://[email protected]
#>    scheme   user   password             host port
#> 1   https                    www.example.com     
#> 2   https                     www.google.com     
#> 3   https user_1 password_1      example.org 8080
#> 4   https   user   password      example.com     
#> 5   https                    www.example.com 8080
#> 6   https                   www.google.co.jp     
#> 7   https                    www.example.com 8080
#> 8   https   user   password      example.com 8080
#> 9   https   user   password      example.com     
#> 10  https   user                 example.com 8080
#> 11  https   user                 example.com     
#>                                                                                                                                                 path
#> 1                                                                                                                                                   
#> 2  /maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3                                                                                                                                        /dir/../api
#> 4                                                                                                                                                   
#> 5                                                                                                                                        /search=1+3
#> 6                                                                                                                                            /search
#> 7                                                                                                                                                   
#> 8                                                                                                                                                   
#> 9                                                                                                                                                   
#> 10                                                                                                                                                  
#> 11                                                                                                                                                  
#>                                                                                                                                             raw_path
#> 1                                                                                                                                                   
#> 2  /maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3                                                                                                                                                   
#> 4                                                                                                                                                   
#> 5                                                                                                                                    /search%3D1%2B3
#> 6                                                                                                                                                   
#> 7                                                                                                                                                   
#> 8                                                                                                                                                   
#> 9                                                                                                                                                   
#> 10                                                                                                                                                  
#> 11                                                                                                                                                  
#>                                raw_query fragment
#> 1                                                
#> 2                                                
#> 3                                    q=1     frag
#> 4                                                
#> 5                                                
#> 6          q=%E3%83%89%E3%82%A4%E3%83%84         
#> 7  var1=foo&var2=ba%20r&var3=baz%2Blarry         
#> 8                                                
#> 9                                                
#> 10                                               
#> 11

Encoding URL:

Note: urltools encode special characters to lower case hex i.e.: “?” -> “%3f” instead of “%3F”

string <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "
(bm <- bench::mark(
  urlparse = urlparse::url_encoder(string),
  curl = curl::curl_escape(string),
  urltools = urltools::url_encode(string),
  base = URLencode(string, reserved = T),
  check = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse     1.48µs   1.64µs   581380.      208B     0   
#> 2 curl         2.25µs   2.58µs   349595.    3.03KB     0   
#> 3 urltools     2.34µs   2.54µs   381930.    2.48KB     0   
#> 4 base        78.84µs  82.33µs    11746.   28.59KB     8.25

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse    1      1         49.5       1        NaN
#> 2 curl        1.53   1.57      29.8      14.9      NaN
#> 3 urltools    1.58   1.55      32.5      12.2      NaN
#> 4 base       53.4   50.2        1       141.       Inf

ggplot2::autoplot(bm)
string <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "
url <- paste0(sample(strsplit(string, "")[[1]], 1e4, replace = TRUE), collapse = "")
(bm <- bench::mark(
  urlparse = urlparse::url_encoder(url),
  curl = curl::curl_escape(url),
  urltools = urltools::url_encode(url),
  base = URLencode(url, reserved = T, repeated = T),
  check = F,
  filter_gc = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse     86.4µs   88.3µs    10599.    15.8KB     0   
#> 2 curl         91.6µs   94.9µs    10306.        0B     0   
#> 3 urltools    241.8µs  247.8µs     3943.    15.8KB     0   
#> 4 base          6.7ms      7ms      138.   333.5KB     8.00

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse    1      1         76.8       Inf      NaN
#> 2 curl        1.06   1.08      74.7       NaN      NaN
#> 3 urltools    2.80   2.81      28.6       Inf      NaN
#> 4 base       77.6   79.3        1         Inf      Inf

ggplot2::autoplot(bm)
Metadata

Version

0.2.1

License

Unknown

Platforms (75)

    Darwin
    FreeBSD
    Genode
    GHCJS
    Linux
    MMIXware
    NetBSD
    none
    OpenBSD
    Redox
    Solaris
    WASI
    Windows
Show all
  • aarch64-darwin
  • aarch64-freebsd
  • aarch64-genode
  • aarch64-linux
  • aarch64-netbsd
  • aarch64-none
  • aarch64-windows
  • aarch64_be-none
  • arm-none
  • armv5tel-linux
  • armv6l-linux
  • armv6l-netbsd
  • armv6l-none
  • armv7a-linux
  • armv7a-netbsd
  • armv7l-linux
  • armv7l-netbsd
  • avr-none
  • i686-cygwin
  • i686-freebsd
  • i686-genode
  • i686-linux
  • i686-netbsd
  • i686-none
  • i686-openbsd
  • i686-windows
  • javascript-ghcjs
  • loongarch64-linux
  • m68k-linux
  • m68k-netbsd
  • m68k-none
  • microblaze-linux
  • microblaze-none
  • microblazeel-linux
  • microblazeel-none
  • mips-linux
  • mips-none
  • mips64-linux
  • mips64-none
  • mips64el-linux
  • mipsel-linux
  • mipsel-netbsd
  • mmix-mmixware
  • msp430-none
  • or1k-none
  • powerpc-netbsd
  • powerpc-none
  • powerpc64-linux
  • powerpc64le-linux
  • powerpcle-none
  • riscv32-linux
  • riscv32-netbsd
  • riscv32-none
  • riscv64-linux
  • riscv64-netbsd
  • riscv64-none
  • rx-none
  • s390-linux
  • s390-none
  • s390x-linux
  • s390x-none
  • vc4-none
  • wasm32-wasi
  • wasm64-wasi
  • x86_64-cygwin
  • x86_64-darwin
  • x86_64-freebsd
  • x86_64-genode
  • x86_64-linux
  • x86_64-netbsd
  • x86_64-none
  • x86_64-openbsd
  • x86_64-redox
  • x86_64-solaris
  • x86_64-windows