Semiparametric Principal Stratification Analysis Beyond Monotonicity.
PSor Package Vignette
The goal of PSor is to estimate principal causal effects under principal stratification using a margin-free, variational-independent odds ratio sensitivity parameter, allowing analysis when monotonicity may not hold. The framework unifies the monotonicity assumption with the counterfactual intermediate independence assumption. The framework also assumes the mean principal ignorability. The package accompanies the paper “Semiparametric Principal Stratification Analysis Beyond Monotonicity” and provides point estimates, standard errors, and confidence intervals for both the conditionally doubly robust (CDR) and debiased machine learning (DML) estimators.
Installation
You can install the development version of PSor from GitHub with:
# install.packages("devtools")
devtools::install_github("deckardt98/PSor")
Example
This example demonstrates how to use PSor.fit to estimate principal causal effects with simulated data from our manuscript. To summarize, the data will include a binary treatment Z, a binary intermediate outcome D, a continuous final outcome Y, and baseline covariates, \mathbf{X}. Let Y(z) and D(z) respectively denote the potential final outcome and potential intermediate outcome under treatment value Z=z. Under principal stratification, the estimand of interest is the principal causal effect: $$\mu_{d_0d_1}=E{Y(1)-Y(0)|D(0)=d_0,D(1)=d_1}.$$ For example, under a noncompliance setup where $D$ denotes the actual treatment received, the principal strata variable $G=(D(0),D(1))$ can be interpreted as follows: $G=11$ represents always-takers, $G=01$ represents compliers, $G=00$ represents never-takers, and $G=10$ represents defiers. The primary estimand of interest is the complier average causal effect (CACE) within the stratum $G=01$. Next, we use a simulated dataset with four covariates to illustrate an example application of the package.
1. Load Package and Generate Data
First, we load the necessary packages and define a function to generate a simulated dataset. This simulated data will include a binary treatment Z, a binary intermediate outcome D, a continuous final outcome Y, and four covariates, X1-X4.
library(truncnorm)
expit <- function(x){return(exp(x)/(1+exp(x)))}
simu_full_data <- function(n, seed=20250917, theta){
# Input:
# n: sample size
# theta: odds ratio sensitivity parameter; theta = 1 assumes independence and theta = Inf assumes monotonicity
set.seed(seed)
Z <- D <- c()
# Simulate covariates
X1 <- rtruncnorm(n, a=-20, b=20, mean = 0, sd = 1)
X2 <- rtruncnorm(n, a=-20, b=20, mean = 0, sd = 1)
X3 <- rtruncnorm(n, a=-20, b=20, mean = 0, sd = 1)
X4 <- rbinom(n, size = 1, prob = 0.5)
if (theta==Inf){
X1 <- abs(X1)
X2 <- abs(X2)
X3 <- abs(X3)
probZ <- expit(0.1*(X1+X2+X3)+0.5*X4)
Z <- rbinom(n, size = 1, prob = probZ)
# Simulate G=(D(0),D(1))
probD1 <- expit(1.2*X4)
probD0 <- expit(-0.4-0.2*X1-0.2*X2-0.2*X3-0.2*X4)
prob11 <- probD0
prob01 <- probD1 - probD0
prob00 <- 1 - probD1
prob_matrix <- cbind(prob00, prob01, prob11)
# Simulate G from the categorical distribution
G <- apply(prob_matrix, 1, function(p) sample(0:2, size = 1, prob = p))
# Simulate D(1)
D1 <- as.numeric(I(G!=0))
# Simulate D(0)
D0 <- as.numeric(I(G==2))
# Compute observed intermediate outcome
D <- D1*Z+(1-Z)*D0
} else if (theta==1) {
probZ <- expit(0.1*(X1+X2+X3)+0.5*X4)
Z <- rbinom(n, size = 1, prob = probZ)
# Simulate (D(0),D(1))
probD1 <- expit(0.3*X1+0.4*X2+0.3*X3+0.5*X4)
probD0 <- expit(0.4*X1+0.3*X2+0.4*X3+0.5*X4)
prob11 <- probD0*probD1
prob10 <- probD0 - prob11
prob01 <- probD1 - prob11
prob00 <- 1-prob11-prob10-prob01
prob_matrix <- cbind(prob10, prob00, prob01, prob11)
jointD <- apply(prob_matrix, 1, function(p) sample(1:4, size = 1, prob = p))
# Simulate D(0)
D0 <- as.numeric(I(jointD==1|jointD==4))
# Simulate D(1)
D1 <- as.numeric(I(jointD==3|jointD==4))
# Compute observed intermediate outcome
D <- D1*Z+(1-Z)*D0
} else {
probZ <- expit(0.1*(X1+X2+X3)+0.5*X4)
Z <- rbinom(n, size = 1, prob = probZ)
# Simulate (D(0),D(1))
probD1 <- expit(0.3*X1+0.4*X2+0.3*X3+0.5*X4)
probD0 <- expit(0.4*X1+0.3*X2+0.4*X3+0.5*X4)
deltaX <- (1+(theta-1)*(probD1+probD0))^2-4*theta*(theta-1)*probD1*probD0
prob11 <- (1+(theta-1)*(probD1+probD0)-sqrt(deltaX))/2/(theta-1)
prob10 <- probD0 - prob11
prob01 <- probD1 - prob11
prob00 <- 1-prob11-prob10-prob01
prob_matrix <- cbind(prob10, prob00, prob01, prob11)
jointD <- apply(prob_matrix, 1, function(p) sample(1:4, size = 1, prob = p))
# Simulate D(0)
D0 <- as.numeric(I(jointD==1|jointD==4))
# Simulate D(1)
D1 <- as.numeric(I(jointD==3|jointD==4))
# Compute observed intermediate outcome
D <- D1*Z+(1-Z)*D0
}
# Simulate potential final outcome
meanY1 <- -1+D1+X1+3*X2+3*X3+3*X4
meanY0 <- 3-D0-1.5*X1+2*X2+2*X3-2*X4
Y1 <- rnorm(n = n, mean = meanY1, sd = 1)
Y0 <- rnorm(n = n, mean = meanY0, sd = 1)
Y <- Y1*Z+(1-Z)*Y0
return(as.data.frame(cbind(X1,X2,X3,X4,Z,D,Y)))
}
2. Run PSor.fit
Now, we simulate a sample data set assuming counterfactual intermediate independence with $\theta(\mathbf{X})=1$, and then call the main function to compute principal causal effects under either correctly specified odds ratio or incorrectly assumed monotonicity.
library(PSor)
library(SuperLearner)
#> Loading required package: nnls
#> Loading required package: gam
#> Loading required package: splines
#> Loading required package: foreach
#> Loaded gam 1.22-6
#> Super Learner
#> Version: 2.0-29
#> Package created on 2024-02-06
# Generate a data set under independence; or = 1
n = 500
theta = 1
df <- simu_full_data(n, theta=theta)
# Fit correctly specified odds ratio, or = 1
PSor.fit(
out.formula = Y~X1+X2+X3+X4,
ps.formula = D~X1+X2+X3+X4,
pro.formula = Z~X1+X2+X3+X4,
df = df,
out.name = "Y",
int.name = "D",
trt.name = "Z",
cov.names = c("X1","X2","X3","X4"),
or = 1,
SLmethods = c("SL.glm", "SL.rpart", "SL.nnet"),
n.fold = 5,
scale = "RD",
alpha = 0.05
)
#> CDR.Est CDR.SE CDR.ci.low CDR.ci.up DML.Est DML.SE DML.ci.low
#> Always-Takers 2.193 0.330 1.545 2.840 2.146 0.359 1.441
#> Compliers -0.198 0.444 -1.067 0.672 -0.239 0.461 -1.142
#> Never-Takers -3.278 0.397 -4.056 -2.500 -3.235 0.405 -4.029
#> Defiers -0.226 0.399 -1.009 0.556 -0.272 0.402 -1.060
#> DML.ci.up
#> Always-Takers 2.850
#> Compliers 0.664
#> Never-Takers -2.442
#> Defiers 0.516
# Fit by incorrectly assuming monotonicity
PSor.fit(
out.formula = Y~X1+X2+X3+X4,
ps.formula = D~X1+X2+X3+X4,
pro.formula = Z~X1+X2+X3+X4,
df = df,
out.name = "Y",
int.name = "D",
trt.name = "Z",
cov.names = c("X1","X2","X3","X4"),
or = Inf,
SLmethods = c("SL.glm", "SL.rpart", "SL.nnet"),
n.fold = 5,
scale = "RD",
alpha = 0.05
)
#> CDR.Est CDR.SE CDR.ci.lower CDR.ci.upper DML.Est DML.SE
#> Always-Takers (11) 1.438 0.310 0.831 2.045 1.433 0.317
#> Compliers (01) -35.204 522.863 -1059.997 989.589 46.878 21.272
#> Never-Takers (00) -2.305 0.298 -2.889 -1.722 -2.278 0.315
#> DML.ci.lower DML.ci.upper
#> Always-Takers (11) 0.812 2.055
#> Compliers (01) 5.185 88.571
#> Never-Takers (00) -2.896 -1.660
Here, the function computes estimates under monotonicity by setting or = Inf. The CDR estimator uses linear regression for the continuous outcome and logistic regression for the intermediate outcome and treatment propensity. For the DML estimator, we will use the SuperLearner package for nuisance function estimation, including the outcome regression, principal score, and propensity score. The argument SLmethods = c("SL.glm", "SL.rpart", "SL.nnet") specifies the machine learning algorithms used to estimate the nuisance functions.