MyNixOS website logo
Description

Useful Functions for Data Processing.

In ancient Chinese mythology, Bai Ze is a divine creature that knows the needs of everything. 'baizer' provides data processing functions frequently used by the author. Hope this package also knows what you want!

baizer

Codecov testcoverage R-CMD-check

  • In ancient Chinese mythology, Bai Ze is a divine creature that knows the needs of everything.
  • baizer provides data processing functions frequently used by the author.
  • Hope this package also knows what you want!

installation

universal installation

You can install the stable version of baizer like so:

install.packages("baizer")

Or install the development version of baizer like so:

devtools::install_github("william-swl/baizer")

specific installation

If you prefer Macports on MacOS:

sudo port install R-baizer

basic utils

  • load packages as a batch
pkglib(dplyr, purrr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
  • versions of packages
pkgver(dplyr, purrr)
#> $dplyr
#> [1] "1.1.1"
#> 
#> $purrr
#> [1] "1.0.1"

# case-insensitive input
pkgver(DplyR)
#> $dplyr
#> [1] "1.1.1"
  • information of packages
# pkginfo(dplyr)

# case-insensitive input
# pkginfo(DplyR)
  • use %nin% to get ‘not in’ logical value
1 %nin% c(1, 2, 3)
#> [1] FALSE

1 %nin% c(2, 3)
#> [1] TRUE
  • use %neq% to get NA supported ‘not equal’ logical value
NA != 0
#> [1] NA

NA != NA
#> [1] NA

NA %neq% 0
#> [1] TRUE

NA %neq% NA
#> [1] FALSE
  • not NA/NULL
not.na(NA)
#> [1] FALSE

not.null(NULL)
#> [1] FALSE
  • dump a vector into string
collapse_vector(c("A" = 2, "B" = 3, "C" = 4), front_name = TRUE, collapse = ";")
#> [1] "A(2);B(3);C(4)"

collapse_vector(c("A" = 2, "B" = 3, "C" = 4), front_name = FALSE, collapse = ",")
#> [1] "2(A),3(B),4(C)"
  • slice character vector
x <- c("A", "B", "C", "D", "E")
slice_char(x, "A", "D")
#> [1] "A" "B" "C" "D"
slice_char(x, "D", "A")
#> [1] "D" "C" "B" "A"

x <- c("A", "B", "C", "C", "A", "D", "D", "E", "A")
slice_char(x, "B", "E")
#> [1] "B" "C" "C" "A" "D" "D" "E"
# duplicated element as boundary will throw an error
# slice_char(x, 'A', 'E')
# unique=TRUE to remove the duplicated boundary characters
slice_char(x, "A", "E", unique = TRUE)
#> [1] "A" "B" "C" "C" "D" "D" "E"
  • the index of different character
diff_index("AAAA", "ABBA")
#> [[1]]
#> [1] 2 3

# ignore case
diff_index("AAAA", "abba", ignore_case = TRUE)
#> [[1]]
#> [1] 2 3

# only the index of nth different character, NA if unaccessible
diff_index("AAAA", "ABBA", nth = 2)
#> [[1]]
#> [1] 3

diff_index("AAAA", "ABBA", 10)
#> [[1]]
#> [1] NA

# second and third indices
diff_index("AAAA", "ABBB", nth = 2:3)
#> [[1]]
#> [1] 3 4

# support vectorized operations
diff_index(c("ABBA", "AABB"), "AAAA")
#> [[1]]
#> [1] 2 3
#> 
#> [[2]]
#> [1] 3 4
  • the index of same character
# just like diff_index
same_index(c("ABBA", "AABB"), "AAAA")
#> [[1]]
#> [1] 1 4
#> 
#> [[2]]
#> [1] 1 2
  • fetch character from strings
fetch_char(rep("ABC", 3), list(1, 2, 3))
#> [[1]]
#> [1] "A"
#> 
#> [[2]]
#> [1] "B"
#> 
#> [[3]]
#> [1] "C"

# accept the output of `diff_index` or `same_index`
str1 <- c("ABCD", "AAEF")
str2 <- c("AAAA", "AAAA")
fetch_char(str1, diff_index(str1, str2))
#> [[1]]
#> [1] "B" "C" "D"
#> 
#> [[2]]
#> [1] "E" "F"

# if the output of `diff_index` have NA, also return NA
fetch_char(str1, diff_index(str1, str2, nth = 1:3), na.rm = FALSE)
#> [[1]]
#> [1] "B" "C" "D"
#> 
#> [[2]]
#> [1] "E" "F" NA

# remove NA
fetch_char(str1, diff_index(str1, str2, nth = 1:5), na.rm = TRUE)
#> [[1]]
#> [1] "B" "C" "D"
#> 
#> [[2]]
#> [1] "E" "F"

# collapse the characters from a same string
fetch_char(str1, diff_index(str1, str2, nth = 1:5), na.rm = TRUE, collapse = ",")
#> [[1]]
#> [1] "B,C,D"
#> 
#> [[2]]
#> [1] "E,F"
  • trans fixed string into regular expression string
fix_to_regex("ABC|?(*)")
#> [1] "ABC\\|\\?\\(\\*\\)"
  • detect possible duplication in a vector, ignore case, blank and special character
detect_dup(c("a", "B", "C_", "c -", "#A"))
#> [1] "a"   "#A"  "C_"  "c -"
  • extract key and values for a character vector
extract_kv(c("x: 1", "y: 2"))
#>   x   y 
#> "1" "2"
  • farthest point sampling (FPS) for a vector
fps_vector(1:10, 2)
#> [1]  1 10

fps_vector(1:10, 4)
#> [1]  1  4  7 10

fps_vector(c(1, 2, NULL), 2)
#> [1] 1 2

fps_vector(c(1, 2, NA), 2)
#> [1]  1 NA
  • regex match
v <- stringr::str_c("id", 1:3, c("A", "B", "C"))
v
#> [1] "id1A" "id2B" "id3C"

# return first group as default
reg_match(v, "id(\\d+)(\\w)")
#> [1] "1" "2" "3"

reg_match(v, "id(\\d+)(\\w)", group = 2)
#> [1] "A" "B" "C"

# when group=-1, return full matched tibble
reg_match(v, "id(\\d+)(\\w)", group = -1)
#> # A tibble: 3 × 3
#>   match group1 group2
#>   <chr> <chr>  <chr> 
#> 1 id1A  1      A     
#> 2 id2B  2      B     
#> 3 id3C  3      C
  • join the matched parts into string
reg_join(c("A_12.B", "C_3.23:2"), "[A-Za-z]+")
#> [1] "AB" "C"

reg_join(c("A_12.B", "C_3.23:2"), "\\w+")
#> [1] "A_12B"  "C_3232"

reg_join(c("A_12.B", "C_3.23:2"), "\\d+", sep = ",")
#> [1] "12"     "3,23,2"

reg_join(c("A_12.B", "C_3.23:2"), "\\d", sep = ",")
#> [1] "1,2"     "3,2,3,2"
  • split vector into list
split_vector(1:10, c(3, 7))
#> [[1]]
#> [1] 1 2 3
#> 
#> [[2]]
#> [1] 4 5 6 7
#> 
#> [[3]]
#> [1]  8  9 10


vec <- stringr::str_split("ABCDEFGHIJ", "") %>% unlist()
vec
#>  [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J"

split_vector(vec, breaks = c(3, 7), bounds = "[)")
#> [[1]]
#> [1] "A" "B"
#> 
#> [[2]]
#> [1] "C" "D" "E" "F"
#> 
#> [[3]]
#> [1] "G" "H" "I" "J"
  • group character vector by a regex pattern
v <- c(
  stringr::str_c("A", c(1, 2, 9, 10, 11, 12, 99, 101, 102)),
  stringr::str_c("B", c(1, 2, 9, 10, 21, 32, 99, 101, 102))
) %>% sample()
v
#>  [1] "B2"   "A10"  "A99"  "A9"   "A2"   "B102" "B1"   "B101" "A101" "A1"  
#> [11] "B10"  "B9"   "A11"  "B21"  "B32"  "A12"  "A102" "B99"

group_vector(v)
#> $A
#> [1] "A10"  "A99"  "A9"   "A2"   "A101" "A1"   "A11"  "A12"  "A102"
#> 
#> $B
#> [1] "B2"   "B102" "B1"   "B101" "B10"  "B9"   "B21"  "B32"  "B99"

group_vector(v, pattern = "\\w\\d")
#> $A1
#> [1] "A10"  "A101" "A1"   "A11"  "A12"  "A102"
#> 
#> $A2
#> [1] "A2"
#> 
#> $A9
#> [1] "A99" "A9" 
#> 
#> $B1
#> [1] "B102" "B1"   "B101" "B10" 
#> 
#> $B2
#> [1] "B2"  "B21"
#> 
#> $B3
#> [1] "B32"
#> 
#> $B9
#> [1] "B9"  "B99"

# the pattern rules are just same as reg_match()
group_vector(v, pattern = "\\w(\\d)")
#> $`1`
#>  [1] "A10"  "B102" "B1"   "B101" "A101" "A1"   "B10"  "A11"  "A12"  "A102"
#> 
#> $`2`
#> [1] "B2"  "A2"  "B21"
#> 
#> $`3`
#> [1] "B32"
#> 
#> $`9`
#> [1] "A99" "A9"  "B9"  "B99"

# unmatched part will alse be stored
group_vector(v, pattern = "\\d{2}")
#> $`10`
#> [1] "A10"  "B102" "B101" "A101" "B10"  "A102"
#> 
#> $`11`
#> [1] "A11"
#> 
#> $`12`
#> [1] "A12"
#> 
#> $`21`
#> [1] "B21"
#> 
#> $`32`
#> [1] "B32"
#> 
#> $`99`
#> [1] "A99" "B99"
#> 
#> $unmatch
#> [1] "B2" "A9" "A2" "B1" "A1" "B9"
  • sort by a function
sortf(c(-2, 1, 3), abs)
#> [1]  1 -2  3

v <- stringr::str_c("id", c(1, 2, 9, 10, 11, 12, 99, 101, 102)) %>% sample()
v
#> [1] "id10"  "id99"  "id1"   "id12"  "id101" "id102" "id9"   "id2"   "id11"

sortf(v, function(x) reg_match(x, "\\d+") %>% as.double())
#> [1] "id1"   "id2"   "id9"   "id10"  "id11"  "id12"  "id99"  "id101" "id102"

# you can also use purrr functions
sortf(v, ~ reg_match(.x, "\\d+") %>% as.double())
#> [1] "id1"   "id2"   "id9"   "id10"  "id11"  "id12"  "id99"  "id101" "id102"


# group before sort
v <- c(
  stringr::str_c("A", c(1, 2, 9, 10, 11, 12, 99, 101, 102)),
  stringr::str_c("B", c(1, 2, 9, 10, 21, 32, 99, 101, 102))
) %>% sample()
v
#>  [1] "A2"   "B101" "A99"  "A102" "A1"   "B2"   "A10"  "B102" "A11"  "A101"
#> [11] "B9"   "A12"  "B10"  "B1"   "B32"  "B21"  "A9"   "B99"

sortf(v, ~ reg_match(.x, "\\d+") %>% as.double(), group_pattern = "\\w")
#>  [1] "A1"   "A2"   "A9"   "A10"  "A11"  "A12"  "A99"  "A101" "A102" "B1"  
#> [11] "B2"   "B9"   "B10"  "B21"  "B32"  "B99"  "B101" "B102"
  • pileup another logical vector on the TRUE values of first vector
# first vector have 2 TRUE value
v1 <- c(TRUE, FALSE, TRUE)

# the length of second vector should also be 2
v2 <- c(FALSE, TRUE)

pileup_logical(v1, v2)
#> [1] FALSE FALSE  TRUE
  • only keep unique vector values and its names
v <- c(a = 1, b = 2, c = 3, b = 2, a = 1)

# unique will lost the names
unique(v)
#> [1] 1 2 3

# uniq can keep them
uniq(v)
#> a b c 
#> 1 2 3
  • replace the items of one object by another
x <- list(A = 1, B = 3)
y <- list(A = 9, C = 10)

replace_item(x, y)
#> $A
#> [1] 9
#> 
#> $B
#> [1] 3

replace_item(x, y, keep_extra = TRUE)
#> $A
#> [1] 9
#> 
#> $B
#> [1] 3
#> 
#> $C
#> [1] 10

x <- list(a = 1, b = list(c = "a", d = FALSE, f = list(x = 0, z = 30)))
y <- list(a = 3, e = 2, b = list(d = TRUE, f = list(x = 10, y = 20)))

replace_item(x, y, keep_extra = TRUE)
#> $a
#> [1] 3
#> 
#> $b
#> $b$c
#> [1] "a"
#> 
#> $b$d
#> [1] TRUE
#> 
#> $b$f
#> $b$f$x
#> [1] 10
#> 
#> $b$f$z
#> [1] 30
#> 
#> $b$f$y
#> [1] 20
#> 
#> 
#> 
#> $e
#> [1] 2
  • generate characters
gen_char(from = "g", n = 5)
#> [1] "g" "h" "i" "j" "k"

gen_char(to = "g", n = 5)
#> [1] "c" "d" "e" "f" "g"

gen_char(from = "g", to = "j")
#> [1] "g" "h" "i" "j"

gen_char(from = "t", n = 5, random = TRUE)
#> [1] "z" "y" "t" "u" "y"

gen_char(from = "x", n = 5, random = TRUE, allow_dup = FALSE, add = c("+", "-"))
#> [1] "y" "z" "-" "+" "x"
  • trans range character into seq characters
rng2seq(c("1-5", "2"))
#> [[1]]
#> [1] "1" "2" "3" "4" "5"
#> 
#> [[2]]
#> [1] "2"
  • return top n items with highest frequency
top_item(c("a", "b", "c", "b"))
#> [1] "b"

top_item(c(1, 2, 3, 2, 2))
#> [1] 2
  • melt a vector into single value
melt_vector(c(NA, 2, 3), method = "first")
#> [1] 2

melt_vector(c(NA, 2, 3), method = "sum")
#> [1] 5

melt_vector(c(NA, 2, 3), method = ",")
#> [1] "2,3"

melt_vector(c(NA, 2, Inf), invalid = c(NA, Inf))
#> [1] 2
  • combine multiple vectors into one
x1 <- c(1, 2, NA, NA)
x2 <- c(3, NA, 2, NA)
x3 <- c(4, NA, NA, 3)

combn_vector(x1, x2, x3, method = "sum")
#> [1] 8 2 2 3
  • broadcast the vector into length n
broadcast_vector(1:3, 5)
#> [1] 1 2 3 1 2
  • replace specific characters in a string by their locations
str_replace_loc("abcde", 1, 3, "A")
#> [1] "Ade"
  • swap the names and values of a vector
v <- c("a" = "A", "b" = "B", "c" = "C")
swap_vecname(v)
#>   A   B   C 
#> "a" "b" "c"

numbers

  • from float number to fixed digits character
round(2.1951, 2)
#> [1] 2.2

round_string(2.1951, 2)
#> [1] "2.20"

signif(2.1951, 3)
#> [1] 2.2

signif_string(2.1951, 3)
#> [1] "2.20"
  • signif or round string depend on the character length
signif_round_string(20.526, 2, "short")
#> [1] "21"
signif_round_string(20.526, 2, "long")
#> [1] "20.53"

# if you want keep the very small value
signif_round_string(0.000002654, 3, full_small = TRUE)
#> [1] "0.00000265"
  • signif while use floor/ceiling
signif_floor(3.19, 2)
#> [1] 3.1

signif_ceiling(3.11, 2)
#> [1] 3.2
  • whether the number string only has zero
is.zero("0.000")
#> [1] TRUE

is.zero("0.0001")
#> [1] FALSE
  • float and percent trans
float_to_percent(0.123, digits = 1)
#> [1] "12.3%"

percent_to_float("123%", digits = 3)
#> [1] "1.230"

percent_to_float("123%", digits = 3, to_double = TRUE)
#> [1] 1.23
  • wrapper of the functions to process number string with prefix and suffix
number_fun_wrapper(">=2.134%", function(x) round(x, 2))
#> [1] ">=2.13%"
  • expand a number vector according to the adjacent two numbers
adjacent_div(10^c(1:3), n_div = 10)
#>  [1]   10   20   30   40   50   60   70   80   90  100  100  200  300  400  500
#> [16]  600  700  800  900 1000

# only keep the unique numbers
adjacent_div(10^c(1:3), n_div = 10, .unique = TRUE)
#>  [1]   10   20   30   40   50   60   70   80   90  100  200  300  400  500  600
#> [16]  700  800  900 1000
  • correct the numbers to a target ratio
correct_ratio(c(10, 10), c(3, 5))
#> [1]  6 10

# support ratio as a float
correct_ratio(c(100, 100), c(0.2, 0.8))
#> [1]  25 100

# more numbers
correct_ratio(10:13, c(2, 3, 4, 6))
#> [1]  4  6  9 13

# with digits after decimal point
correct_ratio(c(10, 10), c(1, 4), digits = 1)
#> [1]  2.5 10.0
  • the ticks near a number
near_ticks(3462, level = 10)
#> [1] 3460 3465 3470
  • the nearest ticks around a number
nearest_tick(3462, level = 10)
#> [1] 3460
  • generate ticks for a number vector
generate_ticks(c(176, 198, 264))
#>  [1] 175 185 195 205 215 225 235 245 255 265
  • split a positive integer number as a number vector
pos_int_split(12, 3, method = "average")
#> [1] 4 4 4

pos_int_split(12, 3, method = "random")
#> [1] 6 1 5

# you can also assign the ratio of output
pos_int_split(12, 3, method = c(1, 2, 3))
#> [1] 2 4 6
  • generate outliers from a series of number
x <- seq(0, 100, 1)

gen_outlier(x, 10)
#>  [1] -104 -112 -115 -145 -179  219  253  210  263  189

# generation limits
gen_outlier(x, 10, lim = c(-80, 160))
#>  [1] -64 -68 -60 -75 -66 157 153 154 158 159

# assign the low and high outliers
gen_outlier(x, 10, lim = c(-80, 160), assign_n = c(0.1, 0.9))
#>  [1] -70 156 159 156 153 155 151 156 151 157

# just generate low outliers
gen_outlier(x, 10, side = "low")
#>  [1] -190 -164 -196 -161 -105 -144 -149 -110 -102  -89

# return with raw vector
gen_outlier(x, 10, only_out = FALSE)
#>   [1] -143 -122  -61  -76  -54  161  199  276  251  195    0    1    2    3    4
#>  [16]    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19
#>  [31]   20   21   22   23   24   25   26   27   28   29   30   31   32   33   34
#>  [46]   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49
#>  [61]   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64
#>  [76]   65   66   67   68   69   70   71   72   73   74   75   76   77   78   79
#>  [91]   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94
#> [106]   95   96   97   98   99  100
  • max-min normalization
mm_norm(c(1, 3, 4))
#> [1] 0.0000000 0.6666667 1.0000000

dataframe

  • a minimal dataset
head(mini_diamond)
#> # A tibble: 6 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-1   1.02 Fair  SI1      3027  6.25  6.18
#> 2 id-2   1.51 Good  VS2     11746  7.27  7.18
#> 3 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#> 4 id-4   1.54 Ideal SI2      9452  7.43  7.45
#> 5 id-5   0.72 Ideal VS1      2498  5.73  5.77
#> 6 id-6   2.02 Fair  SI2     14080  8.33  8.37
  • shortcut of dplyr::column_to_rownames and dplyr::rownames_to_column
head(mini_diamond) %>% c2r("id")
#>      carat   cut clarity price    x    y
#> id-1  1.02  Fair     SI1  3027 6.25 6.18
#> id-2  1.51  Good     VS2 11746 7.27 7.18
#> id-3  0.52 Ideal    VVS1  2029 5.15 5.18
#> id-4  1.54 Ideal     SI2  9452 7.43 7.45
#> id-5  0.72 Ideal     VS1  2498 5.73 5.77
#> id-6  2.02  Fair     SI2 14080 8.33 8.37

# use column index
head(mini_diamond) %>% c2r(1)
#>      carat   cut clarity price    x    y
#> id-1  1.02  Fair     SI1  3027 6.25 6.18
#> id-2  1.51  Good     VS2 11746 7.27 7.18
#> id-3  0.52 Ideal    VVS1  2029 5.15 5.18
#> id-4  1.54 Ideal     SI2  9452 7.43 7.45
#> id-5  0.72 Ideal     VS1  2498 5.73 5.77
#> id-6  2.02  Fair     SI2 14080 8.33 8.37

head(mini_diamond) %>%
  c2r("id") %>%
  r2c("id")
#> # A tibble: 6 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-1   1.02 Fair  SI1      3027  6.25  6.18
#> 2 id-2   1.51 Good  VS2     11746  7.27  7.18
#> 3 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#> 4 id-4   1.54 Ideal SI2      9452  7.43  7.45
#> 5 id-5   0.72 Ideal VS1      2498  5.73  5.77
#> 6 id-6   2.02 Fair  SI2     14080  8.33  8.37
  • fancy count to show an extended column

# count one column
fancy_count(mini_diamond, cut)
#> # A tibble: 3 × 3
#>   cut       n     r
#>   <chr> <int> <dbl>
#> 1 Fair     35  0.35
#> 2 Good     31  0.31
#> 3 Ideal    34  0.34

# count an extended column
fancy_count(mini_diamond, cut, ext = clarity)
#> # A tibble: 3 × 4
#>   cut       n     r clarity                                                
#>   <chr> <int> <dbl> <chr>                                                  
#> 1 Fair     35  0.35 I1(5),IF(4),SI1(5),SI2(4),VS1(3),VS2(5),VVS1(5),VVS2(4)
#> 2 Good     31  0.31 I1(5),IF(5),SI1(4),SI2(4),VS1(2),VS2(4),VVS1(4),VVS2(3)
#> 3 Ideal    34  0.34 I1(4),IF(4),SI1(5),SI2(4),VS1(5),VS2(2),VVS1(5),VVS2(5)

# change format
fancy_count(mini_diamond, cut, ext = clarity, ext_fmt = "ratio")
#> # A tibble: 3 × 4
#>   cut       n     r clarity                                                     
#>   <chr> <int> <dbl> <chr>                                                       
#> 1 Fair     35  0.35 I1(0.14),IF(0.11),SI1(0.14),SI2(0.11),VS1(0.09),VS2(0.14),V…
#> 2 Good     31  0.31 I1(0.16),IF(0.16),SI1(0.13),SI2(0.13),VS1(0.06),VS2(0.13),V…
#> 3 Ideal    34  0.34 I1(0.12),IF(0.12),SI1(0.15),SI2(0.12),VS1(0.15),VS2(0.06),V…

fancy_count(mini_diamond, cut, ext = clarity, ext_fmt = "clean")
#> # A tibble: 3 × 4
#>   cut       n     r clarity                        
#>   <chr> <int> <dbl> <chr>                          
#> 1 Fair     35  0.35 I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
#> 2 Good     31  0.31 I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
#> 3 Ideal    34  0.34 I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2

# count an extended column, in an order by n
fancy_count(mini_diamond, cut, ext = clarity, sort = TRUE)
#> # A tibble: 3 × 4
#>   cut       n     r clarity                                                
#>   <chr> <int> <dbl> <chr>                                                  
#> 1 Fair     35  0.35 I1(5),SI1(5),VS2(5),VVS1(5),IF(4),SI2(4),VVS2(4),VS1(3)
#> 2 Ideal    34  0.34 SI1(5),VS1(5),VVS1(5),VVS2(5),I1(4),IF(4),SI2(4),VS2(2)
#> 3 Good     31  0.31 I1(5),IF(5),SI1(4),SI2(4),VS2(4),VVS1(4),VVS2(3),VS1(2)

# extended column after a two-column count
fancy_count(mini_diamond, cut, clarity, ext = id) %>% head(5)
#> # A tibble: 5 × 5
#>   cut   clarity     n     r id                                          
#>   <chr> <chr>   <int> <dbl> <chr>                                       
#> 1 Fair  I1          5  0.05 id-20(1),id-23(1),id-28(1),id-32(1),id-48(1)
#> 2 Fair  IF          4  0.04 id-12(1),id-45(1),id-89(1),id-95(1)         
#> 3 Fair  SI1         5  0.05 id-1(1),id-64(1),id-65(1),id-68(1),id-76(1) 
#> 4 Fair  SI2         4  0.04 id-25(1),id-40(1),id-6(1),id-99(1)          
#> 5 Fair  VS1         3  0.03 id-36(1),id-43(1),id-85(1)
  • count two columns as a cross-tabulation table
cross_count(mini_diamond, cut, clarity)
#>       I1 IF SI1 SI2 VS1 VS2 VVS1 VVS2
#> Fair   5  4   5   4   3   5    5    4
#> Good   5  5   4   4   2   4    4    3
#> Ideal  4  4   5   4   5   2    5    5

# show the ratio in the row
cross_count(mini_diamond, cut, clarity, method = "rowr")
#>         I1   IF  SI1  SI2  VS1  VS2 VVS1 VVS2
#> Fair  0.14 0.11 0.14 0.11 0.09 0.14 0.14 0.11
#> Good  0.16 0.16 0.13 0.13 0.06 0.13 0.13 0.10
#> Ideal 0.12 0.12 0.15 0.12 0.15 0.06 0.15 0.15

# show the ratio in the col
cross_count(mini_diamond, cut, clarity, method = "colr")
#>         I1   IF  SI1  SI2 VS1  VS2 VVS1 VVS2
#> Fair  0.36 0.31 0.36 0.33 0.3 0.45 0.36 0.33
#> Good  0.36 0.38 0.29 0.33 0.2 0.36 0.29 0.25
#> Ideal 0.29 0.31 0.36 0.33 0.5 0.18 0.36 0.42
  • split a column and return a longer tibble
df <- fancy_count(mini_diamond, cut, ext = clarity)
head(df)
#> # A tibble: 3 × 4
#>   cut       n     r clarity                                                
#>   <chr> <int> <dbl> <chr>                                                  
#> 1 Fair     35  0.35 I1(5),IF(4),SI1(5),SI2(4),VS1(3),VS2(5),VVS1(5),VVS2(4)
#> 2 Good     31  0.31 I1(5),IF(5),SI1(4),SI2(4),VS1(2),VS2(4),VVS1(4),VVS2(3)
#> 3 Ideal    34  0.34 I1(4),IF(4),SI1(5),SI2(4),VS1(5),VS2(2),VVS1(5),VVS2(5)

split_column(df, name_col = cut, value_col = clarity)
#> # A tibble: 24 × 2
#>    cut   clarity
#>    <chr> <chr>  
#>  1 Fair  I1(5)  
#>  2 Fair  IF(4)  
#>  3 Fair  SI1(5) 
#>  4 Fair  SI2(4) 
#>  5 Fair  VS1(3) 
#>  6 Fair  VS2(5) 
#>  7 Fair  VVS1(5)
#>  8 Fair  VVS2(4)
#>  9 Good  I1(5)  
#> 10 Good  IF(5)  
#> # … with 14 more rows
  • move selected rows to target location
# move row 3-5 after row 8
move_row(mini_diamond, 3:5, .after = 8)
#> # A tibble: 100 × 7
#>    id    carat cut   clarity price     x     y
#>    <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#>  1 id-1   1.02 Fair  SI1      3027  6.25  6.18
#>  2 id-2   1.51 Good  VS2     11746  7.27  7.18
#>  3 id-6   2.02 Fair  SI2     14080  8.33  8.37
#>  4 id-7   0.27 Good  VVS1      752  4.1   4.07
#>  5 id-8   0.51 Good  SI2      1029  5.05  5.08
#>  6 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#>  7 id-4   1.54 Ideal SI2      9452  7.43  7.45
#>  8 id-5   0.72 Ideal VS1      2498  5.73  5.77
#>  9 id-9   1.01 Ideal SI1      5590  6.43  6.4 
#> 10 id-10  0.7  Fair  VVS1     1691  5.56  5.41
#> # … with 90 more rows

# move row 3-5 before the first row
move_row(mini_diamond, 3:5, .before = TRUE)
#> # A tibble: 100 × 7
#>    id    carat cut   clarity price     x     y
#>    <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#>  1 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#>  2 id-4   1.54 Ideal SI2      9452  7.43  7.45
#>  3 id-5   0.72 Ideal VS1      2498  5.73  5.77
#>  4 id-1   1.02 Fair  SI1      3027  6.25  6.18
#>  5 id-2   1.51 Good  VS2     11746  7.27  7.18
#>  6 id-6   2.02 Fair  SI2     14080  8.33  8.37
#>  7 id-7   0.27 Good  VVS1      752  4.1   4.07
#>  8 id-8   0.51 Good  SI2      1029  5.05  5.08
#>  9 id-9   1.01 Ideal SI1      5590  6.43  6.4 
#> 10 id-10  0.7  Fair  VVS1     1691  5.56  5.41
#> # … with 90 more rows

# move row 3-5 after the last row
move_row(mini_diamond, 3:5, .after = TRUE)
#> # A tibble: 100 × 7
#>    id    carat cut   clarity price     x     y
#>    <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#>  1 id-1   1.02 Fair  SI1      3027  6.25  6.18
#>  2 id-2   1.51 Good  VS2     11746  7.27  7.18
#>  3 id-6   2.02 Fair  SI2     14080  8.33  8.37
#>  4 id-7   0.27 Good  VVS1      752  4.1   4.07
#>  5 id-8   0.51 Good  SI2      1029  5.05  5.08
#>  6 id-9   1.01 Ideal SI1      5590  6.43  6.4 
#>  7 id-10  0.7  Fair  VVS1     1691  5.56  5.41
#>  8 id-11  1.02 Good  VVS1     7861  6.37  6.4 
#>  9 id-12  0.71 Fair  IF       3205  5.87  5.81
#> 10 id-13  0.56 Ideal SI1      1633  5.31  5.32
#> # … with 90 more rows
  • slice a tibble by an ordered vector
ordered_slice(mini_diamond, id, c("id-3", "id-2"))
#> # A tibble: 2 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#> 2 id-2   1.51 Good  VS2     11746  7.27  7.18

# support NA and known values in ordered vector
ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", "id-3", NA))
#> Warning in ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", : 2
#> NA values!
#> Warning in ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", : 2
#> duplicated values!
#> # A tibble: 5 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#> 2 id-2   1.51 Good  VS2     11746  7.27  7.18
#> 3 <NA>  NA    <NA>  <NA>       NA NA    NA   
#> 4 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#> 5 <NA>  NA    <NA>  <NA>       NA NA    NA

# remove NA
ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", "id-3", NA),
  na.rm = TRUE
)
#> Warning in ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", : 2
#> NA values!
#> Warning in ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", : 2
#> duplicated values!
#> # A tibble: 3 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#> 2 id-2   1.51 Good  VS2     11746  7.27  7.18
#> 3 id-3   0.52 Ideal VVS1     2029  5.15  5.18

# remove duplication
ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", "id-3", NA),
  dup.rm = TRUE
)
#> Warning in ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", : 2
#> NA values!
#> Warning in ordered_slice(mini_diamond, id, c("id-3", "id-2", "unknown_id", : 2
#> duplicated values!
#> # A tibble: 3 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#> 2 id-2   1.51 Good  VS2     11746  7.27  7.18
#> 3 <NA>  NA    <NA>  <NA>       NA NA    NA
  • remove columns by the ratio of NA, default to remove the columns only have NA
df_with_nacol <- dplyr::bind_cols(
  mini_diamond,
  tibble::tibble(na1 = NA, na2 = NA)
)
df_with_nacol
#> # A tibble: 100 × 9
#>    id    carat cut   clarity price     x     y na1   na2  
#>    <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl> <lgl> <lgl>
#>  1 id-1   1.02 Fair  SI1      3027  6.25  6.18 NA    NA   
#>  2 id-2   1.51 Good  VS2     11746  7.27  7.18 NA    NA   
#>  3 id-3   0.52 Ideal VVS1     2029  5.15  5.18 NA    NA   
#>  4 id-4   1.54 Ideal SI2      9452  7.43  7.45 NA    NA   
#>  5 id-5   0.72 Ideal VS1      2498  5.73  5.77 NA    NA   
#>  6 id-6   2.02 Fair  SI2     14080  8.33  8.37 NA    NA   
#>  7 id-7   0.27 Good  VVS1      752  4.1   4.07 NA    NA   
#>  8 id-8   0.51 Good  SI2      1029  5.05  5.08 NA    NA   
#>  9 id-9   1.01 Ideal SI1      5590  6.43  6.4  NA    NA   
#> 10 id-10  0.7  Fair  VVS1     1691  5.56  5.41 NA    NA   
#> # … with 90 more rows

remove_nacol(df_with_nacol)
#> # A tibble: 100 × 7
#>    id    carat cut   clarity price     x     y
#>    <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#>  1 id-1   1.02 Fair  SI1      3027  6.25  6.18
#>  2 id-2   1.51 Good  VS2     11746  7.27  7.18
#>  3 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#>  4 id-4   1.54 Ideal SI2      9452  7.43  7.45
#>  5 id-5   0.72 Ideal VS1      2498  5.73  5.77
#>  6 id-6   2.02 Fair  SI2     14080  8.33  8.37
#>  7 id-7   0.27 Good  VVS1      752  4.1   4.07
#>  8 id-8   0.51 Good  SI2      1029  5.05  5.08
#>  9 id-9   1.01 Ideal SI1      5590  6.43  6.4 
#> 10 id-10  0.7  Fair  VVS1     1691  5.56  5.41
#> # … with 90 more rows

# only keep the columns that have less than 20% NA values
# remove_nacol(df_with_nacol, max_ratio=0.2)
  • remove rows by the ratio of NA
# remove_narow(df)
  • remove columns by the ratio of an identical single value (NA supported)
df_with_monocol <- tibble::tibble(
  x = c(1, 1, 1, 2),
  y = c(1, 1, 2, 2),
  z = c(1, 1, 1, 1),
  x1 = c(1, 1, 1, NA),
  y1 = c(1, 1, NA, NA),
  z1 = c(NA, NA, NA, NA),
  x2 = c(NA, NA, NA, 1),
  y2 = c(NA, NA, 1, 1)
)

df_with_monocol
#> # A tibble: 4 × 8
#>       x     y     z    x1    y1 z1       x2    y2
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <dbl>
#> 1     1     1     1     1     1 NA       NA    NA
#> 2     1     1     1     1     1 NA       NA    NA
#> 3     1     2     1     1    NA NA       NA     1
#> 4     2     2     1    NA    NA NA        1     1

remove_monocol(df_with_monocol)
#> # A tibble: 4 × 6
#>       x     y    x1    y1    x2    y2
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     1     1     1    NA    NA
#> 2     1     1     1     1    NA    NA
#> 3     1     2     1    NA    NA     1
#> 4     2     2    NA    NA     1     1

# only keep the columns that have less than 60% identical values
remove_monocol(df_with_monocol, max_ratio = 0.6)
#> # A tibble: 4 × 3
#>       y    y1    y2
#>   <dbl> <dbl> <dbl>
#> 1     1     1    NA
#> 2     1     1    NA
#> 3     2    NA     1
#> 4     2    NA     1
  • separate numeric vector into bins
vector <- dplyr::pull(mini_diamond, price, id)

hist_bins(vector)
#> # A tibble: 100 × 5
#>    id    value  start    end   bin
#>    <chr> <int>  <dbl>  <dbl> <int>
#>  1 id-1   3027  2218.  3975.     2
#>  2 id-2  11746 11000. 12757.     7
#>  3 id-3   2029   462   2218.     1
#>  4 id-4   9452  9244  11000.     6
#>  5 id-5   2498  2218.  3975.     2
#>  6 id-6  14080 12757. 14513.     8
#>  7 id-7    752   462   2218.     1
#>  8 id-8   1029   462   2218.     1
#>  9 id-9   5590  3975.  5731.     3
#> 10 id-10  1691   462   2218.     1
#> # … with 90 more rows

# set the max and min limits
hist_bins(vector, bins = 20, lim = c(0, 20000))
#> # A tibble: 100 × 5
#>    id    value start   end   bin
#>    <chr> <int> <dbl> <dbl> <int>
#>  1 id-1   3027  3000  4000     4
#>  2 id-2  11746 11000 12000    12
#>  3 id-3   2029  2000  3000     3
#>  4 id-4   9452  9000 10000    10
#>  5 id-5   2498  2000  3000     3
#>  6 id-6  14080 14000 15000    15
#>  7 id-7    752     0  1000     1
#>  8 id-8   1029  1000  2000     2
#>  9 id-9   5590  5000  6000     6
#> 10 id-10  1691  1000  2000     2
#> # … with 90 more rows

# or pass breaks directly
hist_bins(vector, breaks = seq(0, 20000, length.out = 11))
#> # A tibble: 100 × 5
#>    id    value start   end   bin
#>    <chr> <int> <dbl> <dbl> <int>
#>  1 id-1   3027  2000  4000     2
#>  2 id-2  11746 10000 12000     6
#>  3 id-3   2029  2000  4000     2
#>  4 id-4   9452  8000 10000     5
#>  5 id-5   2498  2000  4000     2
#>  6 id-6  14080 14000 16000     8
#>  7 id-7    752     0  2000     1
#>  8 id-8   1029     0  2000     1
#>  9 id-9   5590  4000  6000     3
#> 10 id-10  1691     0  2000     1
#> # … with 90 more rows
  • trans a table in markdown format into tibble
x <- "
  | col1 | col2 | col3 |
  | ---- | ---- | ---- |
  | v1   | v2   | v3   |
  | r1   | r2   | r3   |
  "

as_tibble_md(x)
#> # A tibble: 2 × 3
#>   col1  col2  col3 
#>   <chr> <chr> <chr>
#> 1 v1    v2    v3   
#> 2 r1    r2    r3
  • trans a tibble into markdown format table
mini_diamond %>%
  head(5) %>%
  as_md_table()
#> | id | carat | cut | clarity | price | x | y |
#> | - | - | - | - | - | - | - |
#> | id-1 | 1.02 | Fair | SI1 |  3027 | 6.25 | 6.18 |
#> | id-2 | 1.51 | Good | VS2 | 11746 | 7.27 | 7.18 |
#> | id-3 | 0.52 | Ideal | VVS1 |  2029 | 5.15 | 5.18 |
#> | id-4 | 1.54 | Ideal | SI2 |  9452 | 7.43 | 7.45 |
#> | id-5 | 0.72 | Ideal | VS1 |  2498 | 5.73 | 5.77 |
  • relevel a target column by another reference column
cut_level <- mini_diamond %>%
  pull(cut) %>%
  unique()
df <- mini_diamond %>%
  dplyr::mutate(cut = factor(cut, cut_level)) %>%
  dplyr::mutate(cut0 = stringr::str_c(cut, "xxx"))

levels(df$cut)
#> [1] "Fair"  "Good"  "Ideal"

levels(df$cut0)
#> NULL

# after relevel
df <- ref_level(df, cut0, cut)

levels(df$cut)
#> [1] "Fair"  "Good"  "Ideal"

levels(df$cut0)
#> [1] "Fairxxx"  "Goodxxx"  "Idealxxx"
  • trans list into tibble
x <- list(
  c("a", "1"),
  c("b", "2"),
  c("c", "3")
)

list2df(x, colnames = c("char", "num"))
#>     char num
#> It1    a   1
#> It2    b   2
#> It3    c   3


x <- list(
  c("a", "b", "c"),
  c("1", "2", "3")
)

list2df(x, method = "col")
#>   It1 It2
#> 1   a   1
#> 2   b   2
#> 3   c   3
  • generate a matrix to show whether the item in each element of a list
x <- 1:5 %>% map(~ gen_char(to = "k", n = 5, random = TRUE, seed = .x))

x
#> [[1]]
#> [1] "i" "d" "g" "a" "b"
#> 
#> [[2]]
#> [1] "e" "f" "f" "h" "a"
#> 
#> [[3]]
#> [1] "e" "j" "g" "d" "j"
#> 
#> [[4]]
#> [1] "h" "k" "c" "c" "g"
#> 
#> [[5]]
#> [1] "b" "k" "i" "k" "i"

exist_matrix(x)
#> # A tibble: 5 × 11
#>   g     i     k     a     b     c     d     e     f     h     j    
#> * <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 TRUE  TRUE  FALSE TRUE  TRUE  FALSE TRUE  FALSE FALSE FALSE FALSE
#> 2 FALSE FALSE FALSE TRUE  FALSE FALSE FALSE TRUE  TRUE  TRUE  FALSE
#> 3 TRUE  FALSE FALSE FALSE FALSE FALSE TRUE  TRUE  FALSE FALSE TRUE 
#> 4 TRUE  FALSE TRUE  FALSE FALSE TRUE  FALSE FALSE FALSE TRUE  FALSE
#> 5 FALSE TRUE  TRUE  FALSE TRUE  FALSE FALSE FALSE FALSE FALSE FALSE
  • dataframe rows seriation, which will reorder the rows in a better pattern. Sometimes clearer than hclust
seriated_df <- seriate_df(df)
  • diagnosis a tibble for character NA, NULL, all T/F column, blank in cell
x <- tibble(
  c1 = c("NA", NA, "a", "b"),
  c2 = c("c", "d", "e", "NULL"),
  c3 = c("T", "F", "F", "T"),
  c4 = c("T", "F", "F", NA),
  c5 = c("", " ", "\t", "\n")
)
x
#> # A tibble: 4 × 5
#>   c1    c2    c3    c4    c5   
#>   <chr> <chr> <chr> <chr> <chr>
#> 1 NA    c     T     T     ""   
#> 2 <NA>  d     F     F     " "  
#> 3 a     e     F     F     "\t" 
#> 4 b     NULL  T     <NA>  "\n"

dx_tb(x)
#> $chr_na
#> # A tibble: 1 × 2
#>     row   col
#>   <int> <int>
#> 1     1     1
#> 
#> $chr_null
#> # A tibble: 1 × 2
#>     row   col
#>   <int> <int>
#> 1     4     2
#> 
#> $only_tf
#> [1] 3 4
#> 
#> $blank_in_cell
#> [1] " "  "\t" "\n"
#> 
#> $stat
#>        chr_na      chr_null       only_tf blank_in_cell 
#>             1             1             2             3 
#> 
#> $pass
#> [1] FALSE
  • generate tibbles
gen_tb()
#> # A tibble: 3 × 4
#>       V1    V2     V3     V4
#>    <dbl> <dbl>  <dbl>  <dbl>
#> 1 -3.19  0.406 -1.44   0.240
#> 2 -0.196 1.36   0.466 -0.304
#> 3 -0.326 0.262  0.705  1.31

gen_tb(fill = "str", nrow = 3, ncol = 4, len = 3)
#> # A tibble: 3 × 4
#>   V1    V2    V3    V4   
#>   <chr> <chr> <chr> <chr>
#> 1 slt   imb   kou   cha  
#> 2 xce   qbu   dlx   qmr  
#> 3 yhh   xir   fze   egv
  • differences between two tibbles
tb1 <- gen_tb(fill = "int", seed = 1)
tb1
#> # A tibble: 3 × 4
#>      V1    V2    V3    V4
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    -7    15     4    -4
#> 2     1     3     7    15
#> 3    -9    -9     5     3

tb2 <- gen_tb(fill = "int", seed = 3)
tb2
#> # A tibble: 3 × 4
#>      V1    V2    V3    V4
#>   <dbl> <dbl> <dbl> <dbl>
#> 1   -10   -12     0    12
#> 2    -3     1    11    -8
#> 3     2     0   -13   -12

diff_tb(tb1, tb2)
#> # A tibble: 6 × 6
#>   .diff_type .diff        V1    V2    V3    V4
#>   <chr>      <glue>    <dbl> <dbl> <dbl> <dbl>
#> 1 c          -old[1, ]    -7    15     4    -4
#> 2 c          +new[1, ]   -10   -12     0    12
#> 3 c          -old[2, ]     1     3     7    15
#> 4 c          +new[2, ]    -3     1    11    -8
#> 5 c          -old[3, ]    -9    -9     5     3
#> 6 c          +new[3, ]     2     0   -13   -12
  • transpose a dataframe
tdf(c2r(head(mini_diamond), "id"))
#> # A tibble: 6 × 7
#>   item    `id-1`  `id-2` `id-3`  `id-4`  `id-5`  `id-6`
#>   <chr>   <chr>   <chr>  <chr>   <chr>   <chr>   <chr> 
#> 1 carat   "1.02"  1.51   "0.52"  "1.54"  "0.72"  2.02  
#> 2 cut     "Fair"  Good   "Ideal" "Ideal" "Ideal" Fair  
#> 3 clarity "SI1"   VS2    "VVS1"  "SI2"   "VS1"   SI2   
#> 4 price   " 3027" 11746  " 2029" " 9452" " 2498" 14080 
#> 5 x       "6.25"  7.27   "5.15"  "7.43"  "5.73"  8.33  
#> 6 y       "6.18"  7.18   "5.18"  "7.45"  "5.77"  8.37
  • count unique values in each column
uniq_in_cols(mini_diamond)
#> # A tibble: 7 × 2
#>   col     uniqe_values
#>   <chr>   <chr>       
#> 1 id      100         
#> 2 carat   57          
#> 3 cut     3           
#> 4 clarity 8           
#> 5 price   99          
#> 6 x       89          
#> 7 y       87
  • like left_join(), full_join(), inner_join() while ignore the same columns in right tibble
tb1 <- head(mini_diamond, 4)
tb2 <- tibble(
  id = c("id-2", "id-4", "id-5"),
  carat = 1:3,
  price = c(1000, 2000, 3000),
  newcol = c("new2", "new4", "new5")
)

left_expand(tb1, tb2, by = "id")
#> # A tibble: 4 × 8
#>   id    carat cut   clarity price     x     y newcol
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl> <chr> 
#> 1 id-1   1.02 Fair  SI1      3027  6.25  6.18 <NA>  
#> 2 id-2   1.51 Good  VS2     11746  7.27  7.18 new2  
#> 3 id-3   0.52 Ideal VVS1     2029  5.15  5.18 <NA>  
#> 4 id-4   1.54 Ideal SI2      9452  7.43  7.45 new4

inner_expand(tb1, tb2, by = "id")
#> # A tibble: 2 × 8
#>   id    carat cut   clarity price     x     y newcol
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl> <chr> 
#> 1 id-2   1.51 Good  VS2     11746  7.27  7.18 new2  
#> 2 id-4   1.54 Ideal SI2      9452  7.43  7.45 new4
  • rewrite the NA values in a tibble by another tibble
tb1 <- tibble(
  id = c("id-1", "id-2", "id-3", "id-4"),
  group = c("a", "b", "a", "b"),
  price = c(0, -200, 3000, NA),
  type = c("large", "none", "small", "none")
)

tb2 <- tibble(
  id = c("id-1", "id-2", "id-3", "id-4"),
  group = c("a", "b", "a", "b"),
  price = c(1, 2, 3, 4),
  type = c("l", "x", "x", "m")
)

rewrite_na(tb1, tb2, by = c("id", "group"))
#> # A tibble: 4 × 4
#>   id    group price type 
#>   <chr> <chr> <chr> <chr>
#> 1 id-1  a     0     large
#> 2 id-2  b     -200  none 
#> 3 id-3  a     3000  small
#> 4 id-4  b     4     none
  • remove outliers and NA
out <- tibble(
  id = stringr::str_c("out-", 1:20),
  price = gen_outlier(mini_diamond %>% dplyr::pull(price), n = 20)
)

dim(bind_rows(mini_diamond, out))
#> [1] 120   7

res <- bind_rows(mini_diamond, out) %>%
  remove_outliers(price)

dim(res)
#> [1] 93  7

stat

  • generate all combinations
gen_combn(1:4, n = 2)
#> [[1]]
#> [1] 1 2
#> 
#> [[2]]
#> [1] 1 3
#> 
#> [[3]]
#> [1] 1 4
#> 
#> [[4]]
#> [1] 2 3
#> 
#> [[5]]
#> [1] 2 4
#> 
#> [[6]]
#> [1] 3 4
  • statistical test which returns a extensible tibble
stat_test(mini_diamond, y = price, x = cut, .by = clarity)
#> # A tibble: 24 × 9
#>    y     clarity group1 group2    n1    n2 p      plim psymbol
#>    <chr> <chr>   <chr>  <chr>  <int> <int> <chr> <dbl> <chr>  
#>  1 price I1      Fair   Good       5     5 0.31   1.01 NS     
#>  2 price I1      Fair   Ideal      5     4 0.90   1.01 NS     
#>  3 price I1      Good   Ideal      5     4 0.19   1.01 NS     
#>  4 price IF      Fair   Good       4     5 0.063  1.01 NS     
#>  5 price IF      Fair   Ideal      4     4 0.059  1.01 NS     
#>  6 price IF      Good   Ideal      5     4 1.0    1.01 NS     
#>  7 price SI1     Fair   Good       5     4 1.0    1.01 NS     
#>  8 price SI1     Fair   Ideal      5     5 1.0    1.01 NS     
#>  9 price SI1     Good   Ideal      4     5 0.41   1.01 NS     
#> 10 price SI2     Fair   Good       4     4 0.057  1.01 NS     
#> # … with 14 more rows
  • fold change calculation which returns a extensible tibble
stat_fc(mini_diamond, y = price, x = cut, .by = clarity)
#> # A tibble: 24 × 8
#>    y     clarity group1 group2     y1    y2    fc fc_fmt
#>    <chr> <chr>   <chr>  <chr>   <dbl> <dbl> <dbl> <chr> 
#>  1 price I1      Fair   Good    4695. 2760. 1.70  1.7x  
#>  2 price I1      Fair   Ideal   4695. 4249  1.11  1.1x  
#>  3 price I1      Good   Ideal   2760. 4249  0.649 0.65x 
#>  4 price IF      Fair   Good    2016  1044. 1.93  1.9x  
#>  5 price IF      Fair   Ideal   2016   962. 2.10  2.1x  
#>  6 price IF      Good   Ideal   1044.  962. 1.09  1.1x  
#>  7 price SI1     Fair   Good    5844. 3227. 1.81  1.8x  
#>  8 price SI1     Fair   Ideal   5844. 3877. 1.51  1.5x  
#>  9 price SI1     Good   Ideal   3227. 3877. 0.832 0.83x 
#> 10 price SI2     Fair   Good   13162. 6539. 2.01  2.0x  
#> # … with 14 more rows
  • calculate phi coefficient of two binary variables
data <- matrix(c(10, 8, 14, 18), nrow = 2)
stat_phi(data)
#> [1] 0.1134241

IO

  • get the command line arguments
cmdargs()
#> $wd
#> [1] "/home/william/rpkg/baizer"
#> 
#> $R_env
#> [1] "/home/william/software/mambaforge/envs/baizer/lib/R/bin/exec/R"
#> 
#> $script_path
#> character(0)
#> 
#> $script_dir
#> character(0)
#> 
#> $env_configs
#> [1] "--slave"                               
#> [2] "--no-save"                             
#> [3] "--no-restore"                          
#> [4] "-f"                                    
#> [5] "/tmp/Rtmpus1DLR/callr-scr-73b34fef3f99"

cmdargs("R_env")
#> [1] "/home/william/software/mambaforge/envs/baizer/lib/R/bin/exec/R"
  • detect whether directory is empty recursively, and detect whether file is empty recursively
# create an empty directory
dir.create("some/deep/path/in/a/folder", recursive = TRUE)
empty_dir("some/deep/path/in/a/folder")
#> [1] TRUE

# create an empty file
file.create("some/deep/path/in/a/folder/there_is_a_file.txt")
#> [1] TRUE
empty_dir("some/deep/path/in/a/folder")
#> [1] FALSE
empty_file("some/deep/path/in/a/folder/there_is_a_file.txt", strict = TRUE)
#> [1] TRUE

# create a file with only character of length 0
write("", "some/deep/path/in/a/folder/there_is_a_file.txt")
empty_file("some/deep/path/in/a/folder/there_is_a_file.txt", strict = TRUE)
#> [1] FALSE
empty_file("some/deep/path/in/a/folder/there_is_a_file.txt")
#> [1] TRUE

# clean
unlink("some", recursive = TRUE)
  • read excel file
# read_excel("mini_diamond.xlsx")
  • write a tibble, or a list of tibbles into an excel file
# write_excel(mini_diamond, "mini_diamond.xlsx")

# Ldf <- list(mini_diamond[1:3, ], mini_diamond[4:6, ])
# write_excel(Ldf, '2sheets.xlsx')
  • read multi-sheet excel file as a list of tibbles
# read_excel_list("mini_diamond.xlsx")
  • read front matter markdown
# read_fmmd("markdown_file.md")
  • fetch remote files via sftp
# sftp_con <- sftp_connect(server='remote_host', port=22,
#                         user='username', password = "password", wd='~')
#
# sftp_download(sftp_con,
#    path=c('t1.txt', 't2.txt'),
#    to=c('path1.txt', 'path2.txt')
# )
  • list files from remote server via sftp
# sftp_ls(sftp_con, 'your/dir')

S3 classes in baizer

tbflt

  • save a series of filter conditions, and support logical operation among conditions
  • use filterC to apply tbflt on dplyr::filter
c1 <- tbflt(cut == "Fair")
c2 <- tbflt(x > 8)
c1 | c2
#> <quosure>
#> expr: ^cut == "Fair" | x > 8
#> env:  0x55f4a327c4e0

mini_diamond %>%
  filterC(c1) %>%
  head(5)
#> # A tibble: 5 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-1   1.02 Fair  SI1      3027  6.25  6.18
#> 2 id-6   2.02 Fair  SI2     14080  8.33  8.37
#> 3 id-10  0.7  Fair  VVS1     1691  5.56  5.41
#> 4 id-12  0.71 Fair  IF       3205  5.87  5.81
#> 5 id-18  0.34 Fair  VVS1     1012  4.8   4.76

mini_diamond %>%
  filterC(!c1) %>%
  head(5)
#> # A tibble: 5 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-2   1.51 Good  VS2     11746  7.27  7.18
#> 2 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#> 3 id-4   1.54 Ideal SI2      9452  7.43  7.45
#> 4 id-5   0.72 Ideal VS1      2498  5.73  5.77
#> 5 id-7   0.27 Good  VVS1      752  4.1   4.07

mini_diamond %>% filterC(c1 & c2)
#> # A tibble: 3 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-6   2.02 Fair  SI2     14080  8.33  8.37
#> 2 id-48  2.01 Fair  I1       7294  8.3   8.19
#> 3 id-68  2.32 Fair  SI1     18026  8.47  8.31
  • stricter limitation to avoid the unexpected default behavior
# default behavior of dplyr::filter, use column in data at first
x <- 8
mini_diamond %>% dplyr::filter(y > x)
#> # A tibble: 53 × 7
#>    id    carat cut   clarity price     x     y
#>    <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#>  1 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#>  2 id-4   1.54 Ideal SI2      9452  7.43  7.45
#>  3 id-5   0.72 Ideal VS1      2498  5.73  5.77
#>  4 id-6   2.02 Fair  SI2     14080  8.33  8.37
#>  5 id-8   0.51 Good  SI2      1029  5.05  5.08
#>  6 id-11  1.02 Good  VVS1     7861  6.37  6.4 
#>  7 id-13  0.56 Ideal SI1      1633  5.31  5.32
#>  8 id-14  0.3  Ideal VVS2      812  4.33  4.39
#>  9 id-15  0.28 Good  IF        612  4.09  4.12
#> 10 id-16  0.41 Good  I1        467  4.7   4.74
#> # … with 43 more rows

# so the default behavior of filterC is just like that
# but if you want y > 8, and the defination of cond is far away from
# its application, the results may be unexpected

x <- 8
cond <- tbflt(y > x)
mini_diamond %>% filterC(cond)
#> # A tibble: 53 × 7
#>    id    carat cut   clarity price     x     y
#>    <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#>  1 id-3   0.52 Ideal VVS1     2029  5.15  5.18
#>  2 id-4   1.54 Ideal SI2      9452  7.43  7.45
#>  3 id-5   0.72 Ideal VS1      2498  5.73  5.77
#>  4 id-6   2.02 Fair  SI2     14080  8.33  8.37
#>  5 id-8   0.51 Good  SI2      1029  5.05  5.08
#>  6 id-11  1.02 Good  VVS1     7861  6.37  6.4 
#>  7 id-13  0.56 Ideal SI1      1633  5.31  5.32
#>  8 id-14  0.3  Ideal VVS2      812  4.33  4.39
#>  9 id-15  0.28 Good  IF        612  4.09  4.12
#> 10 id-16  0.41 Good  I1        467  4.7   4.74
#> # … with 43 more rows

cond <- tbflt(y > 8)
mini_diamond %>% filterC(cond)
#> # A tibble: 5 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-6   2.02 Fair  SI2     14080  8.33  8.37
#> 2 id-48  2.01 Fair  I1       7294  8.3   8.19
#> 3 id-49  2.16 Ideal I1       8709  8.31  8.26
#> 4 id-68  2.32 Fair  SI1     18026  8.47  8.31
#> 5 id-97  2.61 Good  SI2     13784  8.66  8.57


# to avoid this, set usecol=FALSE. An error will be raised for warning you
# to change the variable name
# mini_diamond %>% filterC(cond, usecol=FALSE)


# you can always ignore this argument if you know how to use .env or !!
x <- 8
cond1 <- tbflt(y > !!x)
mini_diamond %>% filterC(cond1)
#> # A tibble: 5 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-6   2.02 Fair  SI2     14080  8.33  8.37
#> 2 id-48  2.01 Fair  I1       7294  8.3   8.19
#> 3 id-49  2.16 Ideal I1       8709  8.31  8.26
#> 4 id-68  2.32 Fair  SI1     18026  8.47  8.31
#> 5 id-97  2.61 Good  SI2     13784  8.66  8.57

cond2 <- tbflt(y > .env$x)
mini_diamond %>% filterC(cond1)
#> # A tibble: 5 × 7
#>   id    carat cut   clarity price     x     y
#>   <chr> <dbl> <chr> <chr>   <int> <dbl> <dbl>
#> 1 id-6   2.02 Fair  SI2     14080  8.33  8.37
#> 2 id-48  2.01 Fair  I1       7294  8.3   8.19
#> 3 id-49  2.16 Ideal I1       8709  8.31  8.26
#> 4 id-68  2.32 Fair  SI1     18026  8.47  8.31
#> 5 id-97  2.61 Good  SI2     13784  8.66  8.57

dev

  • add #' into each line of codes for roxygen examples
roxygen_fmt(
  "
code line1
code line2
"
)
#> 
#> #' code line1
#> #' code line2
#> #'
  • use aliases for function arguments
# set y, z as aliases of x when create a function
func <- function(x = 1, y = NULL, z = NULL) {
  x <- alias_arg(x, y, z, default = x)
  return(x)
}

func()
#> [1] 1

func(x = 8)
#> [1] 8

func(z = 10)
#> [1] 10
  • check arguments by custom function
x <- 1
y <- 3
z <- NULL

func <- function(x = NULL, y = NULL, z = NULL) {
  if (check_arg(x, y, z, n = 2)) {
    print("As expected, two arguments is not NULL")
  }

  if (check_arg(x, y, z, n = 1, method = ~ .x < 2)) {
    print("As expected, one argument less than 2")
  }
}

Code of Conduct

Please note that the baizer project is released with a Contributor Code of Conduct. By contributing to this project, you agree to abide by its terms.

Metadata

Version

0.8.0

License

Unknown

Platforms (75)

    Darwin
    FreeBSD
    Genode
    GHCJS
    Linux
    MMIXware
    NetBSD
    none
    OpenBSD
    Redox
    Solaris
    WASI
    Windows
Show all
  • aarch64-darwin
  • aarch64-genode
  • aarch64-linux
  • aarch64-netbsd
  • aarch64-none
  • aarch64_be-none
  • arm-none
  • armv5tel-linux
  • armv6l-linux
  • armv6l-netbsd
  • armv6l-none
  • armv7a-darwin
  • armv7a-linux
  • armv7a-netbsd
  • armv7l-linux
  • armv7l-netbsd
  • avr-none
  • i686-cygwin
  • i686-darwin
  • i686-freebsd
  • i686-genode
  • i686-linux
  • i686-netbsd
  • i686-none
  • i686-openbsd
  • i686-windows
  • javascript-ghcjs
  • loongarch64-linux
  • m68k-linux
  • m68k-netbsd
  • m68k-none
  • microblaze-linux
  • microblaze-none
  • microblazeel-linux
  • microblazeel-none
  • mips-linux
  • mips-none
  • mips64-linux
  • mips64-none
  • mips64el-linux
  • mipsel-linux
  • mipsel-netbsd
  • mmix-mmixware
  • msp430-none
  • or1k-none
  • powerpc-netbsd
  • powerpc-none
  • powerpc64-linux
  • powerpc64le-linux
  • powerpcle-none
  • riscv32-linux
  • riscv32-netbsd
  • riscv32-none
  • riscv64-linux
  • riscv64-netbsd
  • riscv64-none
  • rx-none
  • s390-linux
  • s390-none
  • s390x-linux
  • s390x-none
  • vc4-none
  • wasm32-wasi
  • wasm64-wasi
  • x86_64-cygwin
  • x86_64-darwin
  • x86_64-freebsd
  • x86_64-genode
  • x86_64-linux
  • x86_64-netbsd
  • x86_64-none
  • x86_64-openbsd
  • x86_64-redox
  • x86_64-solaris
  • x86_64-windows