params <- list(EVAL = FALSE) ## ----SETTINGS-knitr, include=FALSE-------------------------------------------- stopifnot(require(knitr)) knitr::opts_chunk$set( dev = "png", dpi = 150, fig.asp = 0.618, fig.width = 7, out.width = "90%", fig.align = "center", comment = NA, eval = if (isTRUE(exists("params"))) params$EVAL else FALSE ) ## ----dat_poiss---------------------------------------------------------------- # # Number of observations in the training dataset (= number of observations in # # the test dataset): # N <- 71 # # Data-generating function: # sim_poiss <- function(nobs = 2 * N, ncon = 10, ncats = 4, nnoise = 39) { # # Regression coefficients for continuous predictors: # coefs_con <- rnorm(ncon) # # Continuous predictors: # dat_sim <- matrix(rnorm(nobs * ncon), ncol = ncon) # # Start linear predictor: # linpred <- 2.1 + dat_sim %*% coefs_con # # # Categorical predictor: # dat_sim <- data.frame( # x = dat_sim, # xcat = gl(n = ncats, k = nobs %/% ncats, length = nobs, # labels = paste0("cat", seq_len(ncats))) # ) # # Regression coefficients for the categorical predictor: # coefs_cat <- rnorm(ncats) # # Continue linear predictor: # linpred <- linpred + coefs_cat[dat_sim$xcat] # # # Noise predictors: # dat_sim <- data.frame( # dat_sim, # xn = matrix(rnorm(nobs * nnoise), ncol = nnoise) # ) # # # Poisson response, using the log link (i.e., exp() as inverse link): # dat_sim$y <- rpois(nobs, lambda = exp(linpred)) # # Shuffle order of observations: # dat_sim <- dat_sim[sample.int(nobs), , drop = FALSE] # # Drop the shuffled original row names: # rownames(dat_sim) <- NULL # return(dat_sim) # } # # Generate data: # set.seed(300417) # dat_poiss <- sim_poiss() # dat_poiss_train <- head(dat_poiss, N) # dat_poiss_test <- tail(dat_poiss, N) ## ----rstanarm_attach, message=FALSE------------------------------------------- # library(rstanarm) ## ----ref_fit_poiss------------------------------------------------------------ # # Number of regression coefficients: # ( D <- sum(grepl("^x", names(dat_poiss_train))) ) # # Prior guess for the number of relevant (i.e., non-zero) regression # # coefficients: # p0 <- 10 # # Prior guess for the overall magnitude of the response values, see Table 1 of # # Piironen and Vehtari (2017, DOI: 10.1214/17-EJS1337SI): # mu_prior <- 100 # # Hyperprior scale for tau, the global shrinkage parameter: # tau0 <- p0 / (D - p0) / sqrt(mu_prior) / sqrt(N) # # Set this manually if desired: # ncores <- parallel::detectCores(logical = FALSE) # ### Only for technical reasons in this vignette (you can omit this when running # ### the code yourself): # ncores <- min(ncores, 2L) # ### # options(mc.cores = ncores) # refm_fml <- as.formula(paste("y", "~", paste( # grep("^x", names(dat_poiss_train), value = TRUE), # collapse = " + " # ))) # refm_fit_poiss <- stan_glm( # formula = refm_fml, # family = poisson(), # data = dat_poiss_train, # prior = hs(global_scale = tau0, slab_df = 100, slab_scale = 1), # ### Only for the sake of speed (not recommended in general): # chains = 2, iter = 1000, # ### # refresh = 0 # ) ## ----projpred_attach, message=FALSE------------------------------------------- # library(projpred) ## ----vs_lat------------------------------------------------------------------- # d_test_lat_poiss <- list( # data = dat_poiss_test, # offset = rep(0, nrow(dat_poiss_test)), # weights = rep(1, nrow(dat_poiss_test)), # ### Here, we are not interested in latent-scale post-processing, so we can set # ### element `y` to a vector of `NA`s: # y = rep(NA, nrow(dat_poiss_test)), # ### # y_oscale = dat_poiss_test$y # ) # refm_poiss <- get_refmodel(refm_fit_poiss, latent = TRUE) # time_lat <- system.time(vs_lat <- varsel( # refm_poiss, # d_test = d_test_lat_poiss, # ### Only for demonstrating an issue with the traditional projection in the # ### next step (not recommended in general): # method = "L1", # ### # ### Only for the sake of speed (not recommended in general): # nclusters_pred = 20, # ### # nterms_max = 14, # ### In interactive use, we recommend not to deactivate the verbose mode: # verbose = 0, # ### # ### For comparability with varsel() based on the traditional projection: # seed = 95930 # ### # )) ## ----time_lat----------------------------------------------------------------- # print(time_lat) ## ----plot_vsel_lat------------------------------------------------------------ # options(projpred.plot_vsel_size_position = "secondary_x") # ( gg_lat <- plot(vs_lat, stats = "gmpd", deltas = TRUE) ) ## ----size_man_lat------------------------------------------------------------- # size_decided_lat <- 11 ## ----size_sgg_lat------------------------------------------------------------- # suggest_size(vs_lat, stat = "gmpd") ## ----predictors_final_lat----------------------------------------------------- # rk_lat <- ranking(vs_lat) # ( predictors_final_lat <- head(rk_lat[["fulldata"]], size_decided_lat) ) ## ----suppress_warn_poiss, include=FALSE--------------------------------------- # warn_instable_orig <- options(projpred.warn_instable_projections = FALSE) ## ----vs_trad------------------------------------------------------------------ # d_test_trad_poiss <- d_test_lat_poiss # d_test_trad_poiss$y <- d_test_trad_poiss$y_oscale # d_test_trad_poiss$y_oscale <- NULL # time_trad <- system.time(vs_trad <- varsel( # refm_fit_poiss, # d_test = d_test_trad_poiss, # ### Only for demonstrating an issue with the traditional projection (not # ### recommended in general): # method = "L1", # ### # ### Only for the sake of speed (not recommended in general): # nclusters_pred = 20, # ### # nterms_max = 30, # ### In interactive use, we recommend not to deactivate the verbose mode: # verbose = 0, # ### # ### For comparability with varsel() based on the latent projection: # seed = 95930 # ### # )) ## ----unsuppress_warn_poiss, include=FALSE------------------------------------- # options(warn_instable_orig) # rm(warn_instable_orig) ## ----post_vs_trad------------------------------------------------------------- # print(time_trad) # ( gg_trad <- plot(vs_trad, stats = "gmpd", deltas = TRUE) ) ## ----ref_fit_nebin------------------------------------------------------------ # refm_fit_nebin <- stan_glm( # formula = refm_fml, # family = neg_binomial_2(), # data = dat_poiss_train, # prior = hs(global_scale = tau0, slab_df = 100, slab_scale = 1), # ### Only for the sake of speed (not recommended in general): # chains = 2, iter = 1000, # ### # refresh = 0 # ) ## ----vs_nebin----------------------------------------------------------------- # refm_prec <- as.matrix(refm_fit_nebin)[, "reciprocal_dispersion", drop = FALSE] # latent_ll_oscale_nebin <- function(ilpreds, # dis = rep(NA, nrow(ilpreds)), # y_oscale, # wobs = rep(1, length(y_oscale)), # cl_ref, # wdraws_ref = rep(1, length(cl_ref))) { # y_oscale_mat <- matrix(y_oscale, nrow = nrow(ilpreds), ncol = ncol(ilpreds), # byrow = TRUE) # wobs_mat <- matrix(wobs, nrow = nrow(ilpreds), ncol = ncol(ilpreds), # byrow = TRUE) # refm_prec_agg <- cl_agg(refm_prec, cl = cl_ref, wdraws = wdraws_ref) # ll_unw <- dnbinom(y_oscale_mat, size = refm_prec_agg, mu = ilpreds, log = TRUE) # return(wobs_mat * ll_unw) # } # latent_ppd_oscale_nebin <- function(ilpreds_resamp, # dis_resamp = rep(NA, nrow(ilpreds_resamp)), # wobs, # cl_ref, # wdraws_ref = rep(1, length(cl_ref)), # idxs_prjdraws) { # refm_prec_agg <- cl_agg(refm_prec, cl = cl_ref, wdraws = wdraws_ref) # refm_prec_agg_resamp <- refm_prec_agg[idxs_prjdraws, , drop = FALSE] # ppd <- rnbinom(prod(dim(ilpreds_resamp)), size = refm_prec_agg_resamp, # mu = ilpreds_resamp) # ppd <- matrix(ppd, nrow = nrow(ilpreds_resamp), ncol = ncol(ilpreds_resamp)) # return(ppd) # } # refm_nebin <- get_refmodel(refm_fit_nebin, latent = TRUE, # latent_ll_oscale = latent_ll_oscale_nebin, # latent_ppd_oscale = latent_ppd_oscale_nebin) # vs_nebin <- varsel( # refm_nebin, # d_test = d_test_lat_poiss, # ### Only for the sake of speed (not recommended in general): # method = "L1", # nclusters_pred = 20, # ### # nterms_max = 14, # ### In interactive use, we recommend not to deactivate the verbose mode: # verbose = 0 # ### # ) ## ----plot_vsel_nebin---------------------------------------------------------- # ( gg_nebin <- plot(vs_nebin, stats = "gmpd", deltas = TRUE) ) ## ----size_man_nebin----------------------------------------------------------- # size_decided_nebin <- 11 ## ----size_sgg_nebin----------------------------------------------------------- # suggest_size(vs_nebin, stat = "gmpd") ## ----predictors_final_nebin--------------------------------------------------- # rk_nebin <- ranking(vs_nebin) # ( predictors_final_nebin <- head(rk_nebin[["fulldata"]], # size_decided_nebin) )