Ensemble Meta-Prediction Framework.
R package MetaIntegration
Ensemble Meta-Prediction Framework to Integrate Multiple Regression Models
Overview
This R
package is an ensemble meta-prediction framework to integrate multiple regression models into a current study.
Installation
If the devtools package is not yet installed, install it first:
install.packages('devtools')
# install the package from Github:
devtools::install_github('umich-biostatistics/MetaIntegration')
Once installed, load the package:
library(MetaIntegration)
Example Usage
For this example, we simulate data and test each of the functions.
Simulate data
############## Generating 1 internal dataset of size n ###########################################
# Full model: Y|X1, X2, X3, X4, B
# (X1, X2, X3, X4, B) follows normal distribution with mean zero, variance one and correlation 0.3
# Y|X1, X2, X3, X4, B follows Bernoulli[expit(-1-0.5X+0.5B)], where expit(x)=exp(x)/[1+exp(x)]
##################################################################################################
set.seed(2333)
n = 200
data.n = data.frame(matrix(ncol = 5, nrow = n))
colnames(data.n) = c('Y', 'X1', 'X2', 'X3', 'B')
data.n[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(n, rep(0,4), diag(0.7,4)+0.3)
############# Function 1 #########################################
data.n$Y = rbinom(n, 1, expit(-1 - 0.5*(data.n$X1 + data.n$X2 + data.n$X3) + 0.5*data.n$B))
############# Generate k=3 external models #########################################
# Full model: Y|X1, X2, X3, B
# Reduced external model 1: Y|X1, X2 with sample size m1
# Reduced external model 2: Y|X1, X3 with sample size m2
# Reduced external model 3: Y|X1, X2, X3 with sample size m3
####################################################################################
# generate data from the full model first, then fit the reduced logistic regression
# on the data to obtain the beta estiamtes and the corresponsing estimated variance
####################################################################################
m1 = 500
m2 = 2000
m3 = 30000
data.m1 = data.frame(matrix(ncol = 5, nrow = m1))
data.m2 = data.frame(matrix(ncol = 5, nrow = m2))
data.m3 = data.frame(matrix(ncol = 5, nrow = m3))
names(data.m1) = names(data.m2) = names(data.m3) = c('Y', 'X1', 'X2', 'X3', 'B')
data.m1[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m1, rep(0,4), diag(0.7,4)+0.3)
data.m2[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m2, rep(0,4), diag(0.7,4)+0.3)
data.m3[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m3, rep(0,4), diag(0.7,4)+0.3)
data.m1$Y = rbinom(m1, 1, expit(-1 - 0.5*(data.m1$X1 + data.m1$X2 + data.m1$X3) + 0.5*data.m1$B))
data.m2$Y = rbinom(m2, 1, expit(-1 - 0.5*(data.m2$X1 + data.m2$X2 + data.m2$X3) + 0.5*data.m2$B))
data.m3$Y = rbinom(m3, 1, expit(-1 - 0.5*(data.m3$X1 + data.m3$X2 + data.m3$X3) + 0.5*data.m3$B))
#fit Y|X using logistic regression to obtain the external beta estimates
fit.E1 = glm(Y ~ X1 + X2, data = data.m1, family = binomial(link='logit'))
fit.E2 = glm(Y ~ X1 + X3, data = data.m2, family = binomial(link='logit'))
fit.E3 = glm(Y ~ X1 + X2 + X3, data = data.m3, family = binomial(link='logit'))
#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 1
beta.E1 = coef(fit.E1)
names(beta.E1) = c('int', 'X1', 'X2')
V.E1 = vcov(fit.E1)
#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 2
beta.E2 = coef(fit.E2)
names(beta.E2) = c('int','X1','X3')
V.E2 = vcov(fit.E2)
#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 3
beta.E3 = coef(fit.E3)
names(beta.E3) = c('int','X1','X2','X3')
V.E3 = vcov(fit.E3)
#Save all the external model information into lists for later use
betaHatExt_list = list(Ext1 = beta.E1, Ext2 = beta.E2, Ext3 = beta.E3)
CovExt_list = list(Ext1 = V.E1, Ext2 = V.E2, Ext3 = V.E3)
rho = list(Ext1 = n/m1, Ext2 = n/m2, Ext3 = n/m3)
fxnCC_LogReg()
Constraint maximum likelihood (CML) method for logistic regression (binary outcome Y).
fit.gamma.I = glm(Y ~ X1 + X2 + X3 + B, data = data.n, family = binomial(link='logit'))
gamma.I = coef(fit.gamma.I)
gamma.CML1 = fxnCC_LogReg(p=3,
q=5,
YInt=data.n$Y,
XInt=cbind(data.n$X1,data.n$X2),
BInt=cbind(data.n$X3,data.n$B),
betaHatExt=beta.E1,
gammaHatInt=gamma.I,
n=n,
tol=1e-8,
maxIter=400,
factor=1)[["gammaHat"]]
gamma.CML2 = fxnCC_LogReg(p=3,
q=5,
YInt=data.n$Y,
XInt=cbind(data.n$X1,data.n$X3),
BInt=cbind(data.n$X2, data.n$B),
betaHatExt=beta.E2,
gammaHatInt=c(gamma.I[1:2],gamma.I[4],gamma.I[3],gamma.I[5]),
n=n,
tol=1e-8,
maxIter=400,
factor=1)[["gammaHat"]]
gamma.CML2 = c(gamma.CML2[1:2], gamma.CML2[4], gamma.CML2[3], gamma.CML2[5])
gamma.CML3 = fxnCC_LogReg(p=4,
q=5,
YInt=data.n$Y,
XInt=cbind(data.n$X1,data.n$X2,data.n$X3),
BInt=cbind(data.n$B),
betaHatExt=beta.E3,
gammaHatInt=gamma.I,
n=n,
tol=1e-8,
maxIter=400,
factor=1)[["gammaHat"]]
asympVar_LogReg()
Asymptotic variance-covariance matrix for gamma_Int and gamma_CML for logistic regression (binary outcome Y).
asy.CML = asympVar_LogReg(k=3,
p=4,
q=5,
YInt=data.n$Y,
XInt=data.n[,c('X1','X2','X3')], #covariates that appeared in at least one external model
BInt=data.n$B, #covariates that not used in any of the external models
gammaHatInt=gamma.I,
betaHatExt_list=betaHatExt_list,
CovExt_list=CovExt_list,
rho=rho,
ExUncertainty=TRUE)
asyV.I = asy.CML[['asyV.I']] #variance of gamma.I
asyV.CML1 = asy.CML[['asyV.CML']][[1]] #variance of gamma.CML1
asyV.CML2 = asy.CML[['asyV.CML']][[2]] #variance of gamma.CML2
asyCov.CML1.I = asy.CML[['asyCov.CML.I']][[1]] #covariance of gamma.CML1 and gamma.I
asyCov.CML2.I = asy.CML[['asyCov.CML.I']][[2]] #covariance of gamma.CML2 and gamma.I
asyCov.CML12 = asy.CML[['asyCov.CML']][['12']] #covariance of gamma.CML1 and gamma.CML2
get_gamma_EB()
Calculate the empirical Bayes (EB) estimates.
gamma.EB1 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML1, asyV.I=asyV.I)[['gamma.EB']]
gamma.EB2 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML2, asyV.I=asyV.I)[['gamma.EB']]
gamma.EB3 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML3, asyV.I=asyV.I)[['gamma.EB']]
get_var_EB()
Using simulation to obtain the asymptotic variance-covariance matrix of gamma_EB, package corpcor and MASS are required.
#Get the asymptotic variance of the EB estimates
V.EB = get_var_EB(k=3,
q=5,
gamma.CML=c(gamma.CML1, gamma.CML2, gamma.CML3),
gamma.I = gamma.I,
asy.CML=asy.CML,
seed=2333,
nsim=2000)
asyV.EB1 = V.EB[['asyV.EB']][[1]] #variance of gamma.EB1
asyV.EB2 = V.EB[['asyV.EB']][[2]] #variance of gamma.EB2
asyV.EB3 = V.EB[['asyV.EB']][[3]] #variance of gamma.EB3
asyCov.EB1.I = V.EB[['asyCov.EB.I']][[1]] #covariance of gamma.EB1 and gamma.I
asyCov.EB2.I = V.EB[['asyCov.EB.I']][[2]] #covariance of gamma.EB2 and gamma.I
asyCov.EB3.I = V.EB[['asyCov.EB.I']][[3]] #covariance of gamma.EB2 and gamma.I
asyCov.EB12 = V.EB[['asyCov.EB']][['12']] #covariance of gamma.EB1 and gamma.EB2
asyCov.EB13 = V.EB[['asyCov.EB']][['13']] #covariance of gamma.EB1 and gamma.EB3
asyCov.EB23 = V.EB[['asyCov.EB']][['23']] #covariance of gamma.EB2 and gamma.EB3
get_OCW()
Obtain the proposed Optimal covariate-Weighted (OCW) estimates, package Rsolnp is required.
get_OCW(k=3,
q=5,
data.XB=data.n[,c('X1','X2','X3','B')],
gamma.EB=c(gamma.EB1, gamma.EB2, gamma.EB3),
V.EB=V.EB)
get_SCLearner()
Obtain the proposed Selective Coefficient-Learner (SC-Learner) estimates.
pred.matrix = matrix(c(1,1,1,0,0,
1,1,0,1,0,
1,1,1,1,0), 5, 3)
rownames(pred.matrix) = c('int','X1','X2','X3','B')
colnames(pred.matrix) = c('E1','E2','E3')
get_SCLearner(k=3,
q=5,
pred.matrix=pred.matrix,
gamma.EB=cbind(gamma.EB1, gamma.EB2, gamma.EB3),
V.EB)
Current Suggested Citation
Gu, T., Taylor, J.M.G. and Mukherjee, B. (2020). An ensemble meta-prediction framework to integrate multiple regression models into a current study. Manuscript in preparation.