## ----include = FALSE----------------------------------------------------------
# Default knitr options applied to every chunk of the vignette.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup--------------------------------------------------------------------
library(xplainfi)
library(DiagrammeR)
library(mlr3learners)

# Seed once up front; each example chunk re-seeds before it simulates data.
set.seed(123)

## ----overview-table, echo=FALSE-----------------------------------------------
# One row per simulation setting, with the behavior expected from PFI and CFI.
# check.names = FALSE keeps the column names containing spaces
# ("PFI Behavior", "CFI Behavior") unchanged for the rendered table.
dgp_overview <- data.frame(
  DGP = c(
    "sim_dgp_correlated",
    "sim_dgp_mediated",
    "sim_dgp_confounded",
    "sim_dgp_interactions",
    "sim_dgp_independent",
    "sim_dgp_ewald"
  ),
  Challenge = c(
    "Spurious correlation",
    "Mediation effects",
    "Confounding",
    "Interaction effects",
    "Baseline (no challenges)",
    "Mixed effects"
  ),
  `PFI Behavior` = c(
    "High for spurious x2",
    "Shows total effects",
    "Biased upward",
    "Low (no main effects)",
    "Accurate",
    "Mixed"
  ),
  `CFI Behavior` = c(
    "Low for spurious x2",
    "Shows direct effects",
    "Less biased",
    "High (captures interactions)",
    "Accurate",
    "Mixed"
  ),
  check.names = FALSE
)

knitr::kable(
  dgp_overview,
  caption = "Overview of simulation settings and expected method behavior"
)

## ----dag-correlated, echo=FALSE, fig.cap="DAG for correlated features DGP", fig.width=10, fig.height=4----
# DAG of the correlated-features setting: X2 is tied to X1 (r≈0.9) but its
# edge into Y carries a coefficient of 0.
grViz(
  "
  digraph Correlated {
    rankdir=LR;
    graph [ranksep=1.5];
    node [shape=circle, style=filled, fontsize=14, width=1.2];

    X1 [fillcolor='lightcoral', label='X₁\n(β=2.0)'];
    X2 [fillcolor='pink', label='X₂\n(β=0)'];
    X3 [fillcolor='lightblue', label='X₃\n(β=1.0)'];
    X4 [fillcolor='lightgray', label='X₄\n(β=0)'];
    Y [fillcolor='greenyellow', label='Y', width=1.5];

    X1 -> X2 [color=red, style=bold, label='r≈0.9'];
    X1 -> Y [label='2.0'];
    X2 -> Y [style=dashed, color=gray, label='0'];
    X3 -> Y [label='1.0'];
    X4 -> Y [style=dashed, color=gray];

    {rank=source; X1; X3; X4}
    {rank=same; X2}
    {rank=sink; Y}
  }"
)

## ----correlated-example-------------------------------------------------------
set.seed(123)
task <- sim_dgp_correlated(n = 500)

# Check correlation between X1 and X2
cor(task$data()[, c("x1", "x2")])

# True coefficients: x1=2.0, x2=0, x3=1.0, x4=0
# Note: x2 is highly correlated with x1 but has NO causal effect!

## ----dag-mediated, echo=FALSE, fig.cap="DAG for mediated effects DGP", fig.width=10, fig.height=4----
# DAG of the mediation setting: Exposure reaches Y only through the Mediator,
# while Direct has both a direct path and a path through the Mediator.
grViz(
  "
  digraph Mediated {
    rankdir=LR;
    graph [ranksep=1.2];
    node [shape=circle, style=filled, fontsize=14, width=1.2];

    E [fillcolor='orange', label='Exposure\n(β=0)'];
    D [fillcolor='lightblue', label='Direct\n(β=0.5)'];
    M [fillcolor='yellow', label='Mediator\n(β=1.5)'];
    N [fillcolor='lightgray', label='Noise\n(β=0)'];
    Y [fillcolor='greenyellow', label='Y', width=1.5];

    E -> M [label='0.8', color=purple, penwidth=2];
    D -> M [label='0.6', color=blue];
    D -> Y [label='0.5', color=blue];
    M -> Y [label='1.5', color=purple, penwidth=2];
    N -> Y [style=dashed, color=gray];

    {rank=source; E; D; N}
    {rank=same; M}
    {rank=sink; Y}
  }"
)

## ----mediated-example---------------------------------------------------------
set.seed(123)
task <- sim_dgp_mediated(n = 500)

# Effect of exposure on Y, read off the DAG coefficients (nothing is computed):
# Total effect = 0.8 * 1.5 = 1.2 (through mediator)
# Direct effect = 0 (no direct path to Y)

## ----dag-confounded, echo=FALSE, fig.cap="DAG for confounding DGP", fig.width=10, fig.height=5----
# DAG of the confounding setting: H drives X1, the Proxy and Y; the Proxy has
# no edge into Y.
grViz(
  "
  digraph Confounded {
    rankdir=LR;
    graph [ranksep=1.2, nodesep=0.8];
    node [shape=circle, style=filled, fontsize=14, width=1.2];

    H [fillcolor='red', label='H\n(Confounder)', style='filled,dashed'];
    X1 [fillcolor='lightcoral', label='X₁\n(β=1.0)'];
    P [fillcolor='pink', label='Proxy\n(β=0)'];
    I [fillcolor='lightblue', label='Independent\n(β=1.0)'];
    Y [fillcolor='greenyellow', label='Y', width=1.5];

    H -> X1 [color=red, label='1.0'];
    H -> P [color=red, style=dashed, label='1.0'];
    H -> Y [color=red, label='1.0', penwidth=2];
    X1 -> Y [label='1.0'];
    I -> Y [label='1.0'];

    {rank=source; H}
    {rank=same; X1; P; I}
    {rank=sink; Y}
  }"
)

## ----confounded-example-------------------------------------------------------
set.seed(123)

# Hidden confounder scenario (default)
task_hidden <- sim_dgp_confounded(n = 500, hidden = TRUE)
task_hidden$feature_names # proxy available but not confounder

# Observable confounder scenario
task_observed <- sim_dgp_confounded(n = 500, hidden = FALSE)
task_observed$feature_names # both confounder and proxy available

## ----dag-interactions, echo=FALSE, fig.cap="DAG for interaction effects DGP", fig.width=10, fig.height=4----
# DAG of the interaction setting: X1 and X2 reach Y only through the X1×X2
# product node.
grViz(
  "
  digraph Interaction {
    rankdir=LR;
    graph [ranksep=1.2];
    node [shape=circle, style=filled, fontsize=14, width=1.2];

    X1 [fillcolor='orange', label='X₁\n(β=0)'];
    X2 [fillcolor='orange', label='X₂\n(β=0)'];
    X3 [fillcolor='lightblue', label='X₃\n(β=1.0)'];
    N1 [fillcolor='lightgray', label='N₁\n(β=0)'];
    N2 [fillcolor='lightgray', label='N₂\n(β=0)'];
    Y [fillcolor='greenyellow', label='Y', width=1.5];
    INT [fillcolor='red', shape=diamond, label='X₁×X₂\n(β=2.0)', width=1.5];

    X1 -> INT [color=red, penwidth=2];
    X2 -> INT [color=red, penwidth=2];
    INT -> Y [color=red, label='2.0', penwidth=2];
    X3 -> Y [label='1.0'];
    N1 -> Y [style=dashed, color=gray];
    N2 -> Y [style=dashed, color=gray];

    {rank=source; X1; X2; X3; N1; N2}
    {rank=same; INT}
    {rank=sink; Y}
  }"
)

## ----interactions-example-----------------------------------------------------
set.seed(123)
task <- sim_dgp_interactions(n = 500)

# Note: X1 and X2 have NO main effects
# Their importance comes ONLY through their interaction

## ----dag-independent, echo=FALSE, fig.cap="DAG for independent features DGP", fig.width=10, fig.height=4----
# DAG of the baseline setting: every feature points straight at Y and there
# are no edges between features.
grViz(
  "
  digraph Independent {
    rankdir=LR;
    graph [ranksep=1.5];
    node [shape=circle, style=filled, fontsize=14, width=1.2];

    X1 [fillcolor='lightblue', label='X₁\n(β=2.0)'];
    X2 [fillcolor='lightblue', label='X₂\n(β=1.0)'];
    X3 [fillcolor='lightblue', label='X₃\n(β=0.5)'];
    N1 [fillcolor='lightgray', label='N₁\n(β=0)'];
    N2 [fillcolor='lightgray', label='N₂\n(β=0)'];
    Y [fillcolor='greenyellow', label='Y', width=1.5];

    X1 -> Y [label='2.0', penwidth=3];
    X2 -> Y [label='1.0', penwidth=2];
    X3 -> Y [label='0.5'];
    N1 -> Y [style=dashed, color=gray];
    N2 -> Y [style=dashed, color=gray];

    {rank=source; X1; X2; X3; N1; N2}
    {rank=sink; Y}
  }"
)

## ----independent-example------------------------------------------------------
set.seed(123)
task <- sim_dgp_independent(n = 500)

# All methods should rank features consistently:
# important1 > important2 > important3 > unimportant1,2 (approx. 0)

## ----dag-ewald, echo=FALSE, fig.cap="DAG for Ewald et al. (2024) DGP", fig.width=10, fig.height=4----
# DAG of the Ewald et al. (2024) setting: X4 and X5 enter Y directly and
# through their product; X1 -> X2 and X3 -> X4 are the feature dependencies.
grViz(
  "
  digraph Ewald {
    rankdir=LR;
    graph [ranksep=1.2];
    node [shape=circle, style=filled, fontsize=14, width=1.2];

    X1 [fillcolor='lightgray', label='X₁\n(β=0)'];
    X2 [fillcolor='lightgray', label='X₂\n(β=0)'];
    X3 [fillcolor='lightgray', label='X₃\n(β=0)'];
    X4 [fillcolor='lightblue', label='X₄\n(β=1.0)'];
    X5 [fillcolor='lightblue', label='X₅\n(β=1.0)'];
    Y [fillcolor='greenyellow', label='Y', width=1.5];
    INT [fillcolor='red', shape=diamond, label='X₄×X₅\n(β=1.0)', width=1.5];

    X1 -> X2 [color=gray, label='≈1.0'];
    X3 -> X4 [color=gray, label='≈1.0'];
    X4 -> Y [label='1.0'];
    X5 -> Y [label='1.0'];
    X4 -> INT [color=red];
    X5 -> INT [color=red];
    INT -> Y [color=red, label='1.0'];

    {rank=source; X1; X3; X5}
    {rank=same; X2; X4}
    {rank=same; INT}
    {rank=sink; Y}
  }"
)

## ----ewald-example------------------------------------------------------------
# Re-seed like every other example chunk so this one is reproducible when it
# is run on its own rather than top-to-bottom.
set.seed(123)
sim_dgp_ewald(n = 500)