MyNixOS website logo
Description

Wrapper for SQL Server bcp Utility.

Provides functions to utilize a command line utility that does bulk inserts and exports from SQL Server databases.

bcputility

CRAN status R-CMD-check Lifecycle: stable

bcputility is a wrapper for the command line utility program from SQL Server that does bulk imports/exports. The package assumes that bcp is already installed and is on the system search path. For large inserts to SQL Server over an ODBC connection (e.g. with the "DBI" package), writes can take a very long time as each row generates an individual insert statement. The bcp Utility greatly improves performance of large writes by using bulk inserts.

An export function is provided for convenience, but likely will not significantly improve performance over other methods.

Installation

You can install the released version of bcputility from CRAN with:

install.packages("bcputility")

Install the development version with:

devtools::install_github("tomroh/bcputility")

If bcp and sqlcmd is not on the system path or you want to override the default, set the option with the full file path:

options(bcputility.bcp.path = "<path-to-bcp>")
options(bcputility.sqlcmd.path = "<path-to-sqlcmd>")

Benchmarks

Benchmarks were performed with a local installation of SQL Server Express. When testing with a remote SQL Server, performance of bcp over odbc was further improved.

Import

library(DBI)
library(data.table)
library(bcputility)
server <- Sys.getenv('MSSQL_SERVER')
database <- Sys.getenv('MSSQL_DB')
driver <- 'ODBC Driver 17 for SQL Server'
set.seed(11)
n <- 1000000
importTable <- data.frame(
  int = sample(x = seq(1L, 10000L, 1L), size = n, replace = TRUE),
  numeric = sample(x = seq(0, 1, length.out = n/100), size = n,
    replace = TRUE),
  character = sample(x = state.abb, size = n, replace = TRUE),
  factor = sample(x = factor(x = month.abb, levels = month.abb),
    size = n, replace = TRUE),
  logical = sample(x = c(TRUE, FALSE), size = n, replace = TRUE),
  date = sample(x = seq(as.Date('2022-01-01'), as.Date('2022-12-31'),
    by = 'days'), size = n, replace = TRUE),
  datetime = sample(x = seq(as.POSIXct('2022-01-01 00:00:00'),
    as.POSIXct('2022-12-31 23:59:59'), by = 'min'), size = n, replace = TRUE)
)
connectArgs <- makeConnectArgs(server = server, database = database)
con <- DBI::dbConnect(odbc::odbc(),
                      Driver = "SQL Server",
                      Server = server,
                      Database = database)
importResults <- microbenchmark::microbenchmark(
  bcpImport1000 = {
    bcpImport(importTable,
              connectargs = connectArgs,
              table = 'importTable1',
              bcpOptions = list("-b", 1000, "-a", 4096, "-e", 10),
              overwrite = TRUE,
              stdout = FALSE)
    },
  bcpImport10000 = {
    bcpImport(importTable,
              connectargs = connectArgs,
              table = 'importTable2',
              bcpOptions = list("-b", 10000, "-a", 4096, "-e", 10),
              overwrite = TRUE,
              stdout = FALSE)
  },
  bcpImport50000 = {
    bcpImport(importTable,
              connectargs = connectArgs,
              table = 'importTable3',
              bcpOptions = list("-b", 50000, "-a", 4096, "-e", 10),
              overwrite = TRUE,
              stdout = FALSE)
  },
  bcpImport100000 = {
    bcpImport(importTable,
      connectargs = connectArgs,
      table = 'importTable4',
      bcpOptions = list("-b", 100000, "-a", 4096, "-e", 10),
      overwrite = TRUE,
      stdout = FALSE)
  },
  dbWriteTable = {
    con <- DBI::dbConnect(odbc::odbc(),
      Driver = driver,
      Server = server,
      Database = database,
      trusted_connection = 'yes')
    DBI::dbWriteTable(con, name = 'importTable5', importTable, overwrite = TRUE)
    },
  times = 30L,
  unit = 'seconds'
)
importResults
exprminlqmeanmedianuqmaxneval
bcpImport100015.01738516.61086817.40555517.65626518.10099019.4448230
bcpImport1000010.09126610.65792610.92673810.91657711.20818411.4602730
bcpImport500008.9824989.3375099.6773759.5715269.89617910.7770930
bcpImport1000008.7695989.3034739.5629219.5819279.85535510.3694930
dbWriteTable13.57095613.82070715.15450514.15900216.37898627.2881930

Time in seconds

Export Table

Note:bcp exports of data may not match the format of fwrite. dateTimeAs = 'write.csv' was used to make timings comparable, which decreased the performance of "data.table". Optimized write formats for date times from fwrite outperforms bcp for data that is small enough to be pulled into memory.

exportResults <- microbenchmark::microbenchmark(
  bcpExportChar = {
    bcpExport('inst/benchmarks/test1.csv',
              connectargs = connectArgs,
              table = 'importTableInit',
              fieldterminator = ',',
              stdout = FALSE)
    },
  bcpExportNchar = {
    bcpExport('inst/benchmarks/test2.csv',
              connectargs = connectArgs,
              table = 'importTableInit',
              fieldterminator = ',',
              stdout = FALSE)
  },
  fwriteQuery = {
    fwrite(DBI::dbReadTable(con, 'importTableInit'),
           'inst/benchmarks/test3.csv', dateTimeAs = 'write.csv',
           col.names = FALSE)
  },
  times = 30L,
  unit = 'seconds'
)
exportResults
exprminlqmeanmedianuqmaxneval
bcpExportChar2.5656542.7274772.7956702.7566852.7922913.35232530
bcpExportNchar2.5893672.7041352.7657842.7349572.7972863.47907430
fwriteQuery7.4297317.6028537.6458527.6547307.7036347.86841930

Time in seconds

Export Query

query <- 'SELECT * FROM [dbo].[importTable1] WHERE int < 1000'
queryResults <- microbenchmark::microbenchmark(
  bcpExportQueryChar = {
    bcpExport('inst/benchmarks/test4.csv',
              connectargs = connectArgs,
              query = query,
              fieldterminator = ',',
              stdout = FALSE)
  },
  bcpExportQueryNchar = {
    bcpExport('inst/benchmarks/test5.csv',
              connectargs = connectArgs,
              query = query,
              fieldterminator = ',',
              stdout = FALSE)
  },
  fwriteQuery = {
    fwrite(DBI::dbGetQuery(con, query),
           'inst/benchmarks/test6.csv', dateTimeAs = 'write.csv',
           col.names = FALSE)
  },
  times = 30L,
  unit = 'seconds'
)
queryResults
exprminlqmeanmedianuqmaxneval
bcpExportQueryChar0.34444910.43973170.45571190.44909240.46155730.723718230
bcpExportQueryNchar0.33052650.44447050.44126700.45006900.46059710.481589430
fwriteQuery0.67378790.71419330.74213770.73119980.75482330.914355530

Time in seconds

Import Geometry

Importing spatial data from 'sf' objects is also supported. The sql statements after import are to produce equivalent tables in the database.

library(sf)
nc <- st_read(system.file("gpkg/nc.gpkg", package = "sf"))
divN <- 10
shp1 <- cbind(nc[sample.int(nrow(nc), n / divN, replace = TRUE),],
  importTable[seq_len(n / divN), ],
  id = seq_len(n / divN))
geometryResults <- microbenchmark::microbenchmark(
  bcpImportGeometry = {
    bcpImport(shp1,
      connectargs = connectArgs,
      table = 'shp1',
      overwrite = TRUE,
      stdout = FALSE,
      spatialtype = 'geometry',
      bcpOptions = list("-b", 50000, "-a", 4096, "-m", 0))
  },
  odbcImportGeometry = {
    con <- DBI::dbConnect(odbc::odbc(),
      driver = driver,
      server = server,
      database = database,
      trusted_connection = 'yes')
    tableName <- 'shp2'
    spatialType <- 'geometry'
    geometryColumn <- 'geom'
    binaryColumn <- 'geomWkb'
    srid <- sf::st_crs(nc)$epsg
    shpBin2 <- data.table(shp1)
    data.table::set(x = shpBin2, j = binaryColumn,
      value = blob::new_blob(lapply(sf::st_as_binary(shpBin2[[geometryColumn]]),
        as.raw)))
    data.table::set(x = shpBin2, j = geometryColumn, value = NULL)
    dataTypes <- DBI::dbDataType(con, shpBin2)
    dataTypes[binaryColumn] <- 'varbinary(max)'
    DBI::dbWriteTable(conn = con, name = tableName, value = shpBin2,
      overwrite = TRUE, field.types = dataTypes)
    DBI::dbExecute(conn = con, sprintf('alter table %1$s add %2$s %3$s;',
      tableName, geometryColumn, spatialType))
    DBI::dbExecute(conn = con,
      sprintf('UPDATE %1$s
    SET geom = %3$s::STGeomFromWKB([%4$s], %2$d);
    ALTER TABLE %1$s DROP COLUMN [%4$s];', tableName, srid, spatialType,
        binaryColumn)
    )
  },
  bcpImportGeography = {
    bcpImport(shp1,
      connectargs = connectArgs,
      table = 'shp3',
      overwrite = TRUE,
      stdout = FALSE,
      spatialtype = 'geography',
      bcpOptions = list("-b", 50000, "-a", 4096, "-m", 0))
  },
  odbcImportGeography = {
    con <- DBI::dbConnect(odbc::odbc(),
      driver = driver,
      server = server,
      database = database,
      trusted_connection = 'yes')
    tableName <- 'shp4'
    spatialType <- 'geography'
    geometryColumn <- 'geom'
    binaryColumn <- 'geomWkb'
    srid <- sf::st_crs(nc)$epsg
    shpBin4 <- data.table(shp1)
    data.table::set(x = shpBin4, j = binaryColumn,
      value = blob::new_blob(lapply(sf::st_as_binary(shpBin4[[geometryColumn]]),
        as.raw)))
    data.table::set(x = shpBin4, j = geometryColumn, value = NULL)
    dataTypes <- DBI::dbDataType(con, shpBin4)
    dataTypes[binaryColumn] <- 'varbinary(max)'
    DBI::dbWriteTable(conn = con, name = tableName, value = shpBin4,
      overwrite = TRUE, field.types = dataTypes)
    DBI::dbExecute(conn = con, sprintf('alter table %1$s add %2$s %3$s;',
      tableName, geometryColumn, spatialType))
    DBI::dbExecute(conn = con,
      sprintf('UPDATE %1$s
    SET geom = %3$s::STGeomFromWKB([%4$s], %2$d);
    ALTER TABLE %1$s DROP COLUMN [%4$s];', tableName, srid, spatialType,
        binaryColumn)
    )
    DBI::dbExecute(conn = con,
      sprintf(
        'UPDATE %1$s SET [%2$s] = [%2$s].MakeValid().ReorientObject().MakeValid()
   WHERE [%2$s].MakeValid().EnvelopeAngle() > 90;',
        tableName, geometryColumn))
  },
  times = 30L,
  unit = 'seconds'
)
geometryResults
exprminlqmeanmedianuqmaxneval
bcpImportGeometry18.0145119.4874720.6883420.4513621.7421226.8703330
odbcImportGeometry18.2972120.6336322.3504421.2908724.0449027.8111230
bcpImportGeography71.2326075.0458882.6528676.3698596.68469102.7090930
odbcImportGeography73.2981876.1248184.5843277.9341997.36155107.0018630

Time in seconds.

Metadata

Version

0.4.3

License

Unknown

Platforms (75)

    Darwin
    FreeBSD
    Genode
    GHCJS
    Linux
    MMIXware
    NetBSD
    none
    OpenBSD
    Redox
    Solaris
    WASI
    Windows
Show all
  • aarch64-darwin
  • aarch64-genode
  • aarch64-linux
  • aarch64-netbsd
  • aarch64-none
  • aarch64_be-none
  • arm-none
  • armv5tel-linux
  • armv6l-linux
  • armv6l-netbsd
  • armv6l-none
  • armv7a-darwin
  • armv7a-linux
  • armv7a-netbsd
  • armv7l-linux
  • armv7l-netbsd
  • avr-none
  • i686-cygwin
  • i686-darwin
  • i686-freebsd
  • i686-genode
  • i686-linux
  • i686-netbsd
  • i686-none
  • i686-openbsd
  • i686-windows
  • javascript-ghcjs
  • loongarch64-linux
  • m68k-linux
  • m68k-netbsd
  • m68k-none
  • microblaze-linux
  • microblaze-none
  • microblazeel-linux
  • microblazeel-none
  • mips-linux
  • mips-none
  • mips64-linux
  • mips64-none
  • mips64el-linux
  • mipsel-linux
  • mipsel-netbsd
  • mmix-mmixware
  • msp430-none
  • or1k-none
  • powerpc-netbsd
  • powerpc-none
  • powerpc64-linux
  • powerpc64le-linux
  • powerpcle-none
  • riscv32-linux
  • riscv32-netbsd
  • riscv32-none
  • riscv64-linux
  • riscv64-netbsd
  • riscv64-none
  • rx-none
  • s390-linux
  • s390-none
  • s390x-linux
  • s390x-none
  • vc4-none
  • wasm32-wasi
  • wasm64-wasi
  • x86_64-cygwin
  • x86_64-darwin
  • x86_64-freebsd
  • x86_64-genode
  • x86_64-linux
  • x86_64-netbsd
  • x86_64-none
  • x86_64-openbsd
  • x86_64-redox
  • x86_64-solaris
  • x86_64-windows