MyNixOS website logo
Description

Best Split Selection Modeling for Low-Dimensional Data.

Functions to generate or sample from all possible splits of features or variables into a number of specified groups. Also computes the best split selection estimator (for low-dimensional data) as defined in Christidis, Van Aelst and Zamar (2019) <arXiv:1812.05678>.

Build Status CRAN_Status_Badge Downloads

splitSelect

This package provides functions for generating all possible splits of variables into groups, and computing the best split selection regression estimator for low-dimensional data.


Installation

You can install the stable version on R CRAN.

install.packages("splitSelect", dependencies = TRUE)

You can install the development version from GitHub.

library(devtools)
devtools::install_github("AnthonyChristidis/splitSelect")

Usage

Here is some code to generate all possible splits of variables into groups.

# Loading library
library(splitSelect)

# Setting number of variables and groups
p <- 8
G <- 4
use.all <- TRUE

# Generate the number of partitions
my.partitions <- generate_partitions(p, G, use.all=use.all)
my.partitions

# Generate the number of splits
nsplit(p, G, use.all=use.all)
# Generate the number of splits (fixed partition)
nsplit(p, G, use.all=use.all,
       fix.partition=matrix(c(2,2,2,2), nrow=1))

# Generate the splits
all.splits <- generate_splits(p, G, use.all=use.all)
head(all.splits)
nrow(all.splits)
# Generate the splits (fixed partition)
all.splits <- generate_splits(p, G, use.all=use.all,
                              fix.partition=matrix(c(2,2,2,2), nrow=1))
head(all.splits)
nrow(all.splits)

# Generate samples of splits
sample.splits <- rsplit(10000, p, G, fix.partition=matrix(c(2,2,2,2), nrow=1))
sample.splits

Here is some code to apply to compute the best split selection estimator for simulated data with spurious correlation in the training set.

# Download the packages
install.packages("simTargetCov")
install.packages("glmnet")
install.packages("SplitReg")

# Setting the parameters
p <- 6
n <- 30
n.test <- 5000
group.beta <- 5
beta <- c(rep(1, 2), rep(group.beta, p-2))
rho <- 0.1
r <- 0.9
SNR <- 3
# Creating the target matrix with "kernel" set to rho
target_cor <- function(r, p){
  Gamma <- diag(p)
  for(i in 1:(p-1)){
    for(j in (i+1):p){
      Gamma[i,j] <- Gamma[j,i] <- r^(abs(i-j))
    }
  }
  return(Gamma)
}
# AR Correlation Structure
Sigma.r <- target_cor(r, p)
Sigma.rho <- target_cor(rho, p)
sigma.epsilon <- as.numeric(sqrt((t(beta) %*% Sigma.rho %*% beta)/SNR))

# Number of cores
nb.cores <- parallel::detectCores()-1
# Registering the clusters
cl <- parallel::makeCluster(nb.cores)
doParallel::registerDoParallel(cl)

# Set the seed
set.seed(0)

# Simulate some data
x.train <- simTargetCov::simTargetCov(n=n, p=p, target=Sigma.r)
y.train <- 1 + x.train %*% beta + rnorm(n=n, mean=0, sd=sigma.epsilon)
x.test <- mvnfast::rmvn(n.test, mu=rep(0,p), sigma=Sigma.rho)
y.test <- 1 + x.test %*% beta + rnorm(n.test, sd=sigma.epsilon)

# Best Split Selection for Regression
system.time(
  split.out <- cv.splitSelect(x.train, y.train, G=2, use.all=TRUE,
                              fix.partition=list(matrix(c(2,4,
                                                          3,3), ncol=2, byrow=TRUE)), fix.split=NULL,
                              intercept=TRUE, group.model="glmnet", alpha=0, nfolds=10,
                              parallel=TRUE, cores=nb.cores)
)
split.predictions <- predict(split.out, newx=x.test)
mean((split.predictions-y.test)^2)/sigma.epsilon^2

# Ending the cluster
parallel::stopCluster(cl)

# Ridge Regression
cv.ridge <- glmnet::cv.glmnet(x.train, y.train, alpha=0)
ridge <- glmnet::glmnet(x.train, y.train, alpha=0, lambda=cv.ridge$lambda.min) 
ridge.predictions <- predict(ridge, newx=x.test)
mean((ridge.predictions-y.test)^2)/sigma.epsilon^2

# Lasso
cv.lasso <- glmnet::cv.glmnet(x.train, y.train, alpha=1)
lasso <- glmnet::glmnet(x.train, y.train, alpha=1, lambda=cv.lasso$lambda.min)
lasso.predictions <- predict(lasso, newx=x.test)
mean((lasso.predictions-y.test)^2)/sigma.epsilon^2

# Elastic Net
cv.elastic <- glmnet::cv.glmnet(x.train, y.train, alpha=3/4)
elastic <- glmnet::glmnet(x.train, y.train, alpha=3/4, lambda=cv.elastic$lambda.min)
elastic.predictions <- predict(elastic, newx=x.test)
mean((elastic.predictions-y.test)^2)/sigma.epsilon^2

# SplitReg
cv.splitreg <- SplitReg::cv.SplitReg(x.train, y.train, num_models=3, alpha=1e-2)
splitreg.predictions <- predict(cv.splitreg, newx=x.test)
mean((splitreg.predictions-y.test)^2)/sigma.epsilon^2

# Looking at the MSPEs for all the possible splits (out-of-sample)
split.mspes <-
  sapply(1:nrow(split.out$splits), function(x, n.test, x.test, split.out, y.test) 
  {mean((y.test-cbind(rep(1, n.test), x.test) %*% split.out$betas[,x])^2)}, 
  n.test, x.test, split.out, y.test)/sigma.epsilon^2
# Minimum MSPE for the splits (out-of-sample)
min(split.mspes)
split.mspes[split.out$optimal.split]
# Optimal splits comparison (out-of-sample)
split.out$splits[which.min(split.mspes),]
split.out$optimal.split.var
# Optimal betas comparison (out-of-sample)
split.out$betas[,which.min(split.mspes), drop=FALSE]
coef(split.out)

License

This package is free and open source software, licensed under GPL (>= 2).

Metadata

Version

1.0.3

License

Unknown

Platforms (75)

    Darwin
    FreeBSD
    Genode
    GHCJS
    Linux
    MMIXware
    NetBSD
    none
    OpenBSD
    Redox
    Solaris
    WASI
    Windows
Show all
  • aarch64-darwin
  • aarch64-genode
  • aarch64-linux
  • aarch64-netbsd
  • aarch64-none
  • aarch64_be-none
  • arm-none
  • armv5tel-linux
  • armv6l-linux
  • armv6l-netbsd
  • armv6l-none
  • armv7a-darwin
  • armv7a-linux
  • armv7a-netbsd
  • armv7l-linux
  • armv7l-netbsd
  • avr-none
  • i686-cygwin
  • i686-darwin
  • i686-freebsd
  • i686-genode
  • i686-linux
  • i686-netbsd
  • i686-none
  • i686-openbsd
  • i686-windows
  • javascript-ghcjs
  • loongarch64-linux
  • m68k-linux
  • m68k-netbsd
  • m68k-none
  • microblaze-linux
  • microblaze-none
  • microblazeel-linux
  • microblazeel-none
  • mips-linux
  • mips-none
  • mips64-linux
  • mips64-none
  • mips64el-linux
  • mipsel-linux
  • mipsel-netbsd
  • mmix-mmixware
  • msp430-none
  • or1k-none
  • powerpc-netbsd
  • powerpc-none
  • powerpc64-linux
  • powerpc64le-linux
  • powerpcle-none
  • riscv32-linux
  • riscv32-netbsd
  • riscv32-none
  • riscv64-linux
  • riscv64-netbsd
  • riscv64-none
  • rx-none
  • s390-linux
  • s390-none
  • s390x-linux
  • s390x-none
  • vc4-none
  • wasm32-wasi
  • wasm64-wasi
  • x86_64-cygwin
  • x86_64-darwin
  • x86_64-freebsd
  • x86_64-genode
  • x86_64-linux
  • x86_64-netbsd
  • x86_64-none
  • x86_64-openbsd
  • x86_64-redox
  • x86_64-solaris
  • x86_64-windows