## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 6,
  fig.height = 4
)

## ----install, eval=FALSE------------------------------------------------------
# devtools::install_github("queelius/compositional.mle")

## ----load---------------------------------------------------------------------
library(compositional.mle)

## ----quickstart----------------------------------------------------------------
# Generate sample data
set.seed(123)
data <- rnorm(100, mean = 5, sd = 2)

# Define the problem (separate from solver strategy)
problem <- mle_problem(
  loglike = function(theta) {
    if (theta[2] <= 0) return(-Inf)
    sum(dnorm(data, theta[1], theta[2], log = TRUE))
  },
  score = function(theta) {
    mu <- theta[1]; sigma <- theta[2]; n <- length(data)
    c(sum(data - mu) / sigma^2,
      -n / sigma + sum((data - mu)^2) / sigma^3)
  },
  constraint = mle_constraint(
    support = function(theta) theta[2] > 0,
    project = function(theta) c(theta[1], max(theta[2], 1e-6))
  ),
  theta_names = c("mu", "sigma")
)

# Solve with gradient ascent
result <- gradient_ascent()(problem, theta0 = c(0, 1))

cat("Estimated mean:", result$theta.hat[1], "(true: 5)\n")
cat("Estimated sd:", result$theta.hat[2], "(true: 2)\n")

## ----problem-solver-------------------------------------------------------------
# The problem encapsulates the statistical model
print(problem)

# Solvers are independent strategies
solver1 <- gradient_ascent(max_iter = 100)
solver2 <- newton_raphson(max_iter = 50)
solver3 <- bfgs()

# Same problem, different solvers
result1 <- solver1(problem, c(0, 1))
result2 <- solver2(problem, c(0, 1))
result3 <- solver3(problem, c(0, 1))

cat("Gradient ascent:", result1$theta.hat, "\n")
cat("Newton-Raphson:", result2$theta.hat, "\n")
cat("BFGS:", result3$theta.hat, "\n")

## ----sequential-----------------------------------------------------------------
# Grid search finds a good region, then gradient ascent refines
strategy <- grid_search(lower = c(-10, 0.5), upper = c(10, 5), n = 5) %>>%
  gradient_ascent(max_iter = 50)

result <- strategy(problem, theta0 = c(0, 1))
cat("Result:", result$theta.hat, "\n")

## ----three-stage----------------------------------------------------------------
# Coarse grid -> gradient ascent -> Newton-Raphson polish
strategy <- grid_search(lower = c(-10, 0.5), upper = c(10, 5), n = 5) %>>%
  gradient_ascent(max_iter = 30) %>>%
  newton_raphson(max_iter = 10)

result <- strategy(problem, theta0 = c(0, 1))
cat("Result:", result$theta.hat, "\n")
cat("Converged:", result$converged, "\n")

## ----race-----------------------------------------------------------------------
# Try gradient-based and derivative-free methods
strategy <- gradient_ascent() %|% bfgs() %|% nelder_mead()

result <- strategy(problem, theta0 = c(0, 1))
cat("Winner:", result$solver, "\n")
cat("Result:", result$theta.hat, "\n")

## ----restarts-------------------------------------------------------------------
strategy <- with_restarts(
  gradient_ascent(),
  n = 10,
  sampler = uniform_sampler(c(-10, 0.5), c(10, 5))
)

result <- strategy(problem, theta0 = c(0, 1))
cat("Best restart:", result$best_restart, "of", result$n_restarts, "\n")
cat("Result:", result$theta.hat, "\n")

## ----conditional----------------------------------------------------------------
strategy <- unless_converged(
  gradient_ascent(max_iter = 10),  # Quick attempt
  newton_raphson(max_iter = 50)    # Refine if needed
)

result <- strategy(problem, theta0 = c(0, 1))
cat("Result:", result$theta.hat, "\n")
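## ----composed-restarts, eval=FALSE-----------------------------------------------
# # Illustrative sketch (not run): the combinators above are designed to compose,
# # so a multi-stage pipeline should itself be usable inside with_restarts().
# # This particular nesting is an assumption drawn from the composable design,
# # not a documented example; treat it as a starting point.
# strategy <- with_restarts(
#   gradient_ascent(max_iter = 30) %>>% newton_raphson(max_iter = 10),
#   n = 5,
#   sampler = uniform_sampler(c(-10, 0.5), c(10, 5))
# )
# result <- strategy(problem, theta0 = c(0, 1))
# cat("Result:", result$theta.hat, "\n")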
## ----constraints----------------------------------------------------------------
# Positive parameters
pos_constraint <- mle_constraint(
  support = function(theta) all(theta > 0),
  project = function(theta) pmax(theta, 1e-8)
)

# Box constraints [0, 10]
box_constraint <- mle_constraint(
  support = function(theta) all(theta >= 0 & theta <= 10),
  project = function(theta) pmax(0, pmin(10, theta))
)

# Use in problem definition
problem_constrained <- mle_problem(
  loglike = function(theta) -sum((theta - 5)^2),
  constraint = pos_constraint
)

## ----subsampling, eval=FALSE-----------------------------------------------------
# # Original log-likelihood uses all data
# loglike_full <- function(theta, obs = large_data) {
#   sum(dnorm(obs, theta[1], theta[2], log = TRUE))
# }
#
# # Stochastic version uses random subsets
# loglike_sgd <- with_subsampling(loglike_full, data = large_data, subsample_size = 100)

## ----penalties--------------------------------------------------------------------
loglike <- function(theta) -sum(theta^2)

# L1 (Lasso), L2 (Ridge), Elastic Net
loglike_l1 <- with_penalty(loglike, penalty_l1(), lambda = 0.1)
loglike_l2 <- with_penalty(loglike, penalty_l2(), lambda = 0.1)
loglike_enet <- with_penalty(loglike, penalty_elastic_net(alpha = 0.5), lambda = 0.1)

theta <- c(1, 2, 3)
cat("Original:", loglike(theta), "\n")
cat("With L1:", loglike_l1(theta), "\n")
cat("With L2:", loglike_l2(theta), "\n")

## ----trace------------------------------------------------------------------------
trace_config <- mle_trace(values = TRUE, path = TRUE, gradients = TRUE)

result <- gradient_ascent(max_iter = 20)(
  problem,
  theta0 = c(0, 1),
  trace = trace_config
)

if (!is.null(result$trace_data)) {
  cat("Iterations:", result$trace_data$total_iterations, "\n")
  cat("Final log-likelihood:", tail(result$trace_data$values, 1), "\n")
}
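## ----penalized-fit, eval=FALSE------------------------------------------------------
# # Illustrative sketch (not run): with_penalty() returns an ordinary
# # log-likelihood function (it is called directly above), so it should plug
# # into mle_problem() like any other. The parameter names below are
# # hypothetical, chosen only for this illustration.
# problem_ridge <- mle_problem(
#   loglike = loglike_l2,
#   theta_names = c("b1", "b2", "b3")  # hypothetical names
# )
#
# # No score function is supplied, so use the derivative-free solver
# result_ridge <- nelder_mead()(problem_ridge, theta0 = c(1, 2, 3))
# cat("Penalized estimate:", result_ridge$theta.hat, "\n")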