## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 6,
  fig.height = 4
)

## ----install, eval=FALSE------------------------------------------------------
# devtools::install_github("queelius/compositional.mle")

## ----load---------------------------------------------------------------------
library(compositional.mle)

## ----quickstart----------------------------------------------------------------
# Generate sample data
set.seed(123)
data <- rnorm(100, mean = 5, sd = 2)

# Define the problem (separate from solver strategy)
problem <- mle_problem(
  loglike = function(theta) {
    if (theta[2] <= 0) return(-Inf)
    sum(dnorm(data, theta[1], theta[2], log = TRUE))
  },
  score = function(theta) {
    mu <- theta[1]; sigma <- theta[2]; n <- length(data)
    c(sum(data - mu) / sigma^2,
      -n / sigma + sum((data - mu)^2) / sigma^3)
  },
  constraint = mle_constraint(
    support = function(theta) theta[2] > 0,
    project = function(theta) c(theta[1], max(theta[2], 1e-6))
  ),
  theta_names = c("mu", "sigma")
)

# Solve with gradient ascent
result <- gradient_ascent()(problem, theta0 = c(0, 1))

cat("Estimated mean:", result$theta.hat[1], "(true: 5)\n")
cat("Estimated sd:", result$theta.hat[2], "(true: 2)\n")

## ----problem-solver-------------------------------------------------------------
# The problem encapsulates the statistical model
print(problem)

# Solvers are independent strategies
solver1 <- gradient_ascent(max_iter = 100)
solver2 <- newton_raphson(max_iter = 50)
solver3 <- bfgs()

# Same problem, different solvers
result1 <- solver1(problem, c(0, 1))
result2 <- solver2(problem, c(0, 1))
result3 <- solver3(problem, c(0, 1))

cat("Gradient ascent:", result1$theta.hat, "\n")
cat("Newton-Raphson:", result2$theta.hat, "\n")
cat("BFGS:", result3$theta.hat, "\n")

## ----sequential-----------------------------------------------------------------
# Grid search finds a good region, then gradient ascent refines
strategy <- grid_search(lower = c(-10, 0.5), upper = c(10, 5), n = 5) %>>%
  gradient_ascent(max_iter = 50)

result <- strategy(problem, theta0 = c(0, 1))
cat("Result:", result$theta.hat, "\n")

## ----three-stage----------------------------------------------------------------
# Coarse grid -> gradient ascent -> Newton-Raphson polish
strategy <- grid_search(lower = c(-10, 0.5), upper = c(10, 5), n = 5) %>>%
  gradient_ascent(max_iter = 30) %>>%
  newton_raphson(max_iter = 10)

result <- strategy(problem, theta0 = c(0, 1))
cat("Result:", result$theta.hat, "\n")
cat("Converged:", result$converged, "\n")

## ----race-----------------------------------------------------------------------
# Try gradient-based and derivative-free methods
strategy <- gradient_ascent() %|% bfgs() %|% nelder_mead()

result <- strategy(problem, theta0 = c(0, 1))
cat("Winner:", result$solver, "\n")
cat("Result:", result$theta.hat, "\n")

## ----restarts-------------------------------------------------------------------
strategy <- with_restarts(
  gradient_ascent(),
  n = 10,
  sampler = uniform_sampler(c(-10, 0.5), c(10, 5))
)

result <- strategy(problem, theta0 = c(0, 1))
cat("Best restart:", result$best_restart, "of", result$n_restarts, "\n")
cat("Result:", result$theta.hat, "\n")

## ----conditional----------------------------------------------------------------
strategy <- unless_converged(
  gradient_ascent(max_iter = 10),  # Quick attempt
  newton_raphson(max_iter = 50)    # Refine if needed
)

result <- strategy(problem, theta0 = c(0, 1))
cat("Result:", result$theta.hat, "\n")
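## ----composed-restarts, eval=FALSE-----------------------------------------------
# # Illustrative sketch (not run): the combinators above are designed to compose,
# # so a multi-stage pipeline should itself be usable inside with_restarts().
# # This particular nesting is an assumption drawn from the composable design,
# # not a documented example; treat it as a starting point.
# strategy <- with_restarts(
#   gradient_ascent(max_iter = 30) %>>% newton_raphson(max_iter = 10),
#   n = 5,
#   sampler = uniform_sampler(c(-10, 0.5), c(10, 5))
# )
# result <- strategy(problem, theta0 = c(0, 1))
# cat("Result:", result$theta.hat, "\n")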
## ----constraints----------------------------------------------------------------
# Positive parameters
pos_constraint <- mle_constraint(
  support = function(theta) all(theta > 0),
  project = function(theta) pmax(theta, 1e-8)
)

# Box constraints [0, 10]
box_constraint <- mle_constraint(
  support = function(theta) all(theta >= 0 & theta <= 10),
  project = function(theta) pmax(0, pmin(10, theta))
)

# Use in problem definition
problem_constrained <- mle_problem(
  loglike = function(theta) -sum((theta - 5)^2),
  constraint = pos_constraint
)

## ----subsampling, eval=FALSE-----------------------------------------------------
# # Original log-likelihood uses all data
# loglike_full <- function(theta, obs = large_data) {
#   sum(dnorm(obs, theta[1], theta[2], log = TRUE))
# }
#
# # Stochastic version uses random subsets
# loglike_sgd <- with_subsampling(loglike_full, data = large_data, subsample_size = 100)

## ----penalties--------------------------------------------------------------------
loglike <- function(theta) -sum(theta^2)

# L1 (Lasso), L2 (Ridge), Elastic Net
loglike_l1 <- with_penalty(loglike, penalty_l1(), lambda = 0.1)
loglike_l2 <- with_penalty(loglike, penalty_l2(), lambda = 0.1)
loglike_enet <- with_penalty(loglike, penalty_elastic_net(alpha = 0.5), lambda = 0.1)

theta <- c(1, 2, 3)
cat("Original:", loglike(theta), "\n")
cat("With L1:", loglike_l1(theta), "\n")
cat("With L2:", loglike_l2(theta), "\n")

## ----trace------------------------------------------------------------------------
trace_config <- mle_trace(values = TRUE, path = TRUE, gradients = TRUE)

result <- gradient_ascent(max_iter = 20)(
  problem,
  theta0 = c(0, 1),
  trace = trace_config
)

if (!is.null(result$trace_data)) {
  cat("Iterations:", result$trace_data$total_iterations, "\n")
  cat("Final log-likelihood:", tail(result$trace_data$values, 1), "\n")
}
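## ----penalized-fit, eval=FALSE------------------------------------------------------
# # Illustrative sketch (not run): with_penalty() returns an ordinary
# # log-likelihood function (it is called directly above), so it should plug
# # into mle_problem() like any other. The parameter names below are
# # hypothetical, chosen only for this illustration.
# problem_ridge <- mle_problem(
#   loglike = loglike_l2,
#   theta_names = c("b1", "b2", "b3")  # hypothetical names
# )
#
# # No score function is supplied, so use the derivative-free solver
# result_ridge <- nelder_mead()(problem_ridge, theta0 = c(1, 2, 3))
# cat("Penalized estimate:", result_ridge$theta.hat, "\n")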