# =============================================================================
# TEMA 2 — ELECCIÓN DISCRETA (LOGIT Y PROBIT): SCRIPT INTERACTIVO
# =============================================================================
# Autor: Carlos de Anta Puig
#        Economista · Perito Financiero
#        Miembro del Colegio de Economistas de Madrid
#        Miembro del Instituto Español de Analistas Financieros (IEAF)
#        Profesor de Econometría y Microeconometría
#        carlos@cwconsultores.com
#
# Manual de Microeconometría - Digital Reasons (2026)
# https://github.com/carlanta/MicroEconometrics
#
# Versión: 1.0
# Fecha:   2026-03-27
#
# Ejecutar bloque a bloque en RStudio.
# =============================================================================

#
# Autor:  Carlos de Anta Puig
#         Economista - Perito Financiero
#         Miembro del Colegio de Economistas de Madrid
#         Miembro del Instituto Espanol de Analistas Financieros (IEAF)
#         Profesor de Econometría y Microeconometría
#         carlosmaria.deanta@unir.net
#
# Version: 1.0 | Fecha: Marzo 2026
#
# Descripcion:
#   Script pedagogico completo del Tema 2. Cubre:
#   - Generacion del dataset simulado de participacion laboral
#   - Modelo Lineal de Probabilidad (MLP) y sus limitaciones
#   - Modelo Probit: estimacion, efectos marginales, bondad de ajuste
#   - Modelo Logit: odds ratios, efectos marginales, comparacion
#   - Comparacion completa MLP vs Probit vs Logit
#
# Uso:
#   Ejecutar el script completo en RStudio. Se detiene entre secciones
#   con readline() para que el alumno pueda leer los resultados.
# ============================================================================


# ============================================================================
# CARGA DE LIBRERIAS
# ============================================================================
suppressPackageStartupMessages({
  library(lmtest)    # test de heterocedasticidad (bptest)
  library(sandwich)  # errores estandar robustos
  library(mfx)       # efectos marginales: probitmfx(), logitmfx()
  library(aod)       # test de Wald: wald.test()
  library(DT)        # tablas interactivas
})

# Funcion auxiliar para pausas interactivas
pausa <- function() invisible(readline(prompt = "Pulsa ENTER para continuar..."))

# Funcion auxiliar para tablas interactivas DT
tabla_dt <- function(datos, titulo = "", decimales = 2, filas = TRUE) {
  if (is.numeric(datos) && !is.null(names(datos))) {
    datos <- data.frame(Nombre = names(datos), Valor = round(datos, decimales))
    filas <- FALSE
  }
  cat("[Abriendo tabla en el Viewer...]\n")
  print(DT::datatable(
    datos,
    caption  = titulo,
    extensions = 'Buttons',
    options  = list(
      dom        = 'Bfrtip',
      buttons    = c('copy', 'csv', 'excel'),
      pageLength = 15,
      scrollX    = TRUE
    ),
    rownames = filas
  ))
}

# Funcion auxiliar para transparencias en graficos
alpha_color <- function(col, alpha) {
  rgb_vals <- col2rgb(col) / 255
  rgb(rgb_vals[1], rgb_vals[2], rgb_vals[3], alpha = alpha)
}


# ============================================================================
# SECCION 1: GENERACION DEL DATASET
# ============================================================================
# Generamos un dataset simulado de participacion laboral en Espana.
# Variable dependiente: trabaja (1 = participa, 0 = no participa)
# El proceso generador de datos (DGP) usa una funcion logistica.
# ============================================================================

set.seed(2024)
N <- 500

# --- Variables explicativas ---
educacion       <- round(runif(N, min = 6, max = 20))
edad            <- round(runif(N, min = 18, max = 65))
sexo            <- rbinom(N, size = 1, prob = 0.50)
experiencia     <- pmax(0, round(edad - educacion - 6 + rnorm(N, mean = 0, sd = 3)))
hijos           <- sample(0:4, N, replace = TRUE, prob = c(0.35, 0.30, 0.20, 0.10, 0.05))
ingreso_familiar <- round(pmax(0, rnorm(N, mean = 10, sd = 6)), 1)

# --- Indice latente y probabilidad (DGP verdadero) ---
b0 <- -2.00; b1 <- 0.20; b2 <- 0.03; b3 <- -0.60
b4 <-  0.05; b5 <- -0.35; b6 <- -0.08

z <- b0 + b1*educacion + b2*edad + b3*sexo +
     b4*experiencia + b5*hijos + b6*ingreso_familiar

prob_trabaja <- 1 / (1 + exp(-z))
trabaja <- rbinom(N, size = 1, prob = prob_trabaja)

# --- Construccion del data frame ---
datos <- data.frame(
  id               = 1:N,
  trabaja          = trabaja,
  educacion        = educacion,
  edad             = edad,
  sexo             = factor(sexo, levels = c(0, 1), labels = c("Hombre", "Mujer")),
  experiencia      = experiencia,
  hijos            = hijos,
  ingreso_familiar = ingreso_familiar
)

# --- Exploracion basica ---
cat("=== DATASET GENERADO ===\n")
cat("Observaciones:", N, "\n")
cat("Proporcion que trabaja:", round(mean(datos$trabaja), 3), "\n\n")
print(summary(datos[, -1]))

cat("\nTasa de participacion por sexo:\n")
print(tapply(datos$trabaja, datos$sexo, mean))

# Tabla interactiva del dataset
tabla_dt(datos, titulo = "Dataset: participacion laboral simulada")

# --- Grafico exploratorio ---
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))

barplot(table(datos$trabaja),
        names.arg = c("No trabaja (0)", "Trabaja (1)"),
        col = c("#E74C3C", "#2ECC71"),
        main = "Distribucion de 'trabaja'",
        ylab = "Frecuencia", xlab = "")

boxplot(educacion ~ trabaja, data = datos,
        names = c("No trabaja", "Trabaja"),
        col = c("#E74C3C", "#2ECC71"),
        main = "Educacion segun participacion",
        ylab = "Anos de educacion")

barplot(tapply(datos$trabaja, datos$sexo, mean),
        col = c("#3498DB", "#E91E8C"),
        main = "Tasa de participacion por sexo",
        ylab = "Proporcion que trabaja", ylim = c(0, 1))
abline(h = mean(datos$trabaja), lty = 2, col = "gray50")

hist(prob_trabaja, breaks = 20, col = "#9B59B6",
     main = "Distribucion de P(trabaja=1) verdadera",
     xlab = "Probabilidad", ylab = "Frecuencia")
abline(v = 0.5, lty = 2, col = "red")

par(mfrow = c(1, 1))

# Para que se pare
pausa()


# ============================================================================
# SECCION 2: MODELO LINEAL DE PROBABILIDAD (MLP)
# ============================================================================
# P(trabaja = 1 | x) = b0 + b1*educacion + ... (estimacion por MCO)
# Demostramos sus tres limitaciones principales.
# ============================================================================

mlp_modelo <- lm(trabaja ~ educacion + edad + sexo +
                   experiencia + hijos + ingreso_familiar,
                 data = datos)

cat("=== RESULTADOS DEL MLP (MCO) ===\n")
print(summary(mlp_modelo))

# --- Tabla de coeficientes ---
coefs <- summary(mlp_modelo)$coefficients
tabla_mlp <- data.frame(
  Variable    = rownames(coefs),
  Coeficiente = round(coefs[, 1], 4),
  Std_Error   = round(coefs[, 2], 4),
  t_valor     = round(coefs[, 3], 3),
  p_valor     = round(coefs[, 4], 4),
  Significativo = ifelse(coefs[, 4] < 0.001, "***",
                  ifelse(coefs[, 4] < 0.01,  "**",
                  ifelse(coefs[, 4] < 0.05,  "*",
                  ifelse(coefs[, 4] < 0.10,  ".",  ""))))
)
rownames(tabla_mlp) <- NULL
tabla_dt(tabla_mlp, titulo = "Coeficientes del MLP", filas = FALSE)

# --- Interpretacion ---
cat("\n=== INTERPRETACION (MLP: coefs = efectos marginales) ===\n")
b <- coef(mlp_modelo)
cat(sprintf("educacion      : +1 ano -> +%.1f pp en P(trabaja)\n", b["educacion"]*100))
cat(sprintf("sexo (Mujer)   : ser mujer -> %.1f pp\n", b["sexoMujer"]*100))
cat(sprintf("hijos          : +1 hijo  -> %.1f pp\n", b["hijos"]*100))
cat(sprintf("ingreso_fam    : +1.000 EUR -> %.1f pp\n", b["ingreso_familiar"]*100))

# --- Limitacion 1: predicciones fuera de [0,1] ---
datos$prob_mlp <- predict(mlp_modelo, type = "response")
n_fuera <- sum(datos$prob_mlp < 0 | datos$prob_mlp > 1)
cat(sprintf("\nPredicciones fuera de [0,1]: %d de %d (%.1f%%)\n",
            n_fuera, nrow(datos), 100 * n_fuera / nrow(datos)))

# --- Limitacion 2: heterocedasticidad ---
cat("\n=== TEST DE HETEROCEDASTICIDAD (Breusch-Pagan) ===\n")
bp_test <- bptest(mlp_modelo)
print(bp_test)
if (bp_test$p.value < 0.05) {
  message("-> Rechazamos H0: HAY heterocedasticidad (p < 0.05)")
} else {
  message("-> No se rechaza H0: sin evidencia de heterocedasticidad")
}

# --- Errores robustos ---
cat("\n=== COEFICIENTES CON ERRORES ROBUSTOS (HC1) ===\n")
print(coeftest(mlp_modelo, vcov = vcovHC(mlp_modelo, type = "HC1")))

# --- Graficos MLP ---
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))

hist(datos$prob_mlp, breaks = 30, col = "#3498DB", border = "white",
     main = "MLP: Distribucion de probabilidades predichas",
     xlab = "P(trabaja = 1)", ylab = "Frecuencia")
abline(v = c(0, 1), col = "red", lty = 2, lwd = 2)

mlp_simple <- lm(trabaja ~ educacion, data = datos)
plot(datos$educacion + runif(nrow(datos), -0.3, 0.3),
     datos$trabaja,
     pch = 16, col = alpha_color("#2ECC71", 0.3),
     main = "MLP: Recta ajustada (trabaja ~ educacion)",
     xlab = "Anos de educacion", ylab = "P(trabaja = 1)")
abline(mlp_simple, col = "#E74C3C", lwd = 2)
abline(h = c(0, 1), col = "gray50", lty = 2)

plot(mlp_modelo$fitted.values, mlp_modelo$residuals,
     pch = 16, col = alpha_color("#9B59B6", 0.4),
     main = "MLP: Residuos vs Valores Ajustados",
     xlab = "Valores ajustados", ylab = "Residuos")
abline(h = 0, col = "red", lty = 2, lwd = 2)

edu_seq <- seq(6, 20, length.out = 100)
pred_mlp_line <- coef(mlp_simple)[1] + coef(mlp_simple)[2] * edu_seq
logit_simple <- glm(trabaja ~ educacion, data = datos, family = binomial)
pred_logit_line <- predict(logit_simple,
                            newdata = data.frame(educacion = edu_seq),
                            type = "response")
plot(edu_seq, pred_mlp_line,
     type = "l", col = "#E74C3C", lwd = 2, ylim = c(-0.1, 1.1),
     main = "MLP vs Logit: forma funcional",
     xlab = "Anos de educacion", ylab = "P(trabaja = 1)")
lines(edu_seq, pred_logit_line, col = "#2980B9", lwd = 2, lty = 2)
abline(h = c(0, 1), col = "gray70", lty = 3)
legend("topleft", legend = c("MLP (lineal)", "Logit (no lineal)"),
       col = c("#E74C3C", "#2980B9"), lty = c(1, 2), lwd = 2, bty = "n")

par(mfrow = c(1, 1))

cat("\n=== RESUMEN: LIMITACIONES DEL MLP ===\n")
cat("1. Probabilidades fuera de [0,1]\n")
cat("2. Heterocedasticidad del error\n")
cat("3. Efecto marginal constante (no realista)\n")
cat("-> Solucion: Probit o Logit\n")

pausa()


# ============================================================================
# SECCION 3: MODELO PROBIT
# ============================================================================
# P(trabaja = 1 | x) = Phi(x'beta)
# Phi = CDF de la Normal Estandar. Estimacion por Maxima Verosimilitud.
# ============================================================================

probit_modelo <- glm(trabaja ~ educacion + edad + sexo +
                       experiencia + hijos + ingreso_familiar,
                     data = datos, family = binomial(link = "probit"))

cat("=== RESULTADOS DEL MODELO PROBIT ===\n")
print(summary(probit_modelo))

# --- Tabla de coeficientes ---
coefs_p <- summary(probit_modelo)$coefficients
tabla_probit <- data.frame(
  Variable      = rownames(coefs_p),
  Coeficiente   = round(coefs_p[, 1], 4),
  Std_Error     = round(coefs_p[, 2], 4),
  z_valor       = round(coefs_p[, 3], 3),
  p_valor       = round(coefs_p[, 4], 4),
  Significativo = ifelse(coefs_p[, 4] < 0.001, "***",
                  ifelse(coefs_p[, 4] < 0.01,  "**",
                  ifelse(coefs_p[, 4] < 0.05,  "*",
                  ifelse(coefs_p[, 4] < 0.10,  ".",  ""))))
)
rownames(tabla_probit) <- NULL
tabla_dt(tabla_probit, titulo = "Coeficientes del modelo Probit", filas = FALSE)

# --- Interpretacion de coeficientes (solo signo) ---
cat("\nATENCION: En Probit los coeficientes NO son efectos marginales.\n")
cat("Solo informan la DIRECCION del efecto:\n\n")
b <- coef(probit_modelo)
for (nm in names(b)[-1]) {
  signo <- ifelse(b[nm] > 0, "AUMENTA (+)", "DISMINUYE (-)")
  cat(sprintf("  %-20s -> P(trabaja=1) %s\n", nm, signo))
}

pausa()

# --- Efectos marginales ---
cat("\n=== EFECTOS MARGINALES (en la media muestral) ===\n")
probit_mfx <- probitmfx(trabaja ~ educacion + edad + sexo +
                          experiencia + hijos + ingreso_familiar,
                        data = datos, atmean = TRUE)
print(probit_mfx)

cat("\n=== EFECTOS MARGINALES (promedio - AME) ===\n")
probit_mfx_avg <- probitmfx(trabaja ~ educacion + edad + sexo +
                               experiencia + hijos + ingreso_familiar,
                             data = datos, atmean = FALSE)
print(probit_mfx_avg)

# --- Interpretacion de efectos marginales ---
cat("\n=== INTERPRETACION DE EFECTOS MARGINALES ===\n")
em <- probit_mfx$mfxest
cat(sprintf("educacion   : +1 ano -> +%.2f pp en P(trabaja)\n", em["educacion","dF/dx"]*100))
cat(sprintf("sexo (Mujer): ser mujer -> %.2f pp\n", em["sexoMujer","dF/dx"]*100))
cat(sprintf("hijos       : +1 hijo -> %.2f pp\n", em["hijos","dF/dx"]*100))

pausa()

# --- Bondad de ajuste ---
cat("\n=== BONDAD DE AJUSTE ===\n")
pseudoR2_mcfadden <- 1 - (probit_modelo$deviance / probit_modelo$null.deviance)
cat(sprintf("Pseudo-R2 de McFadden: %.4f\n", pseudoR2_mcfadden))
cat("  (Valores entre 0.2 y 0.4 = buen ajuste en modelos binarios)\n\n")
cat(sprintf("AIC: %.2f\n", AIC(probit_modelo)))
cat(sprintf("BIC: %.2f\n\n", BIC(probit_modelo)))

# --- Contrastes ---
cat("=== TEST DE WALD (significacion conjunta) ===\n")
cat("H0: todos los coeficientes (excepto constante) = 0\n\n")
wald_res <- wald.test(b = coef(probit_modelo), Sigma = vcov(probit_modelo), Terms = 2:7)
print(wald_res)

cat("\n=== TEST DE RATIO DE VEROSIMILITUD (LR) ===\n")
probit_nulo <- glm(trabaja ~ 1, data = datos, family = binomial(link = "probit"))
LR_stat     <- 2 * (logLik(probit_modelo) - logLik(probit_nulo))
LR_df       <- length(coef(probit_modelo)) - 1
LR_pval     <- pchisq(LR_stat, df = LR_df, lower.tail = FALSE)
cat(sprintf("LR = %.2f, gl = %d, p-valor = %.6f\n", LR_stat, LR_df, LR_pval))
if (LR_pval < 0.05) message("-> Rechazamos H0: modelo conjuntamente significativo.")

# --- Clasificacion ---
cat("\n=== TABLA DE CLASIFICACION (umbral = 0.5) ===\n")
datos$prob_probit <- predict(probit_modelo, type = "response")
datos$pred_probit <- ifelse(datos$prob_probit > 0.5, 1, 0)
tabla_clasif_p <- table(Observado = datos$trabaja, Predicho = datos$pred_probit)
print(tabla_clasif_p)
cat(sprintf("Tasa de aciertos: %.1f%%\n", 100 * sum(diag(tabla_clasif_p)) / sum(tabla_clasif_p)))

# --- Graficos Probit ---
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))

edu_seq <- seq(min(datos$educacion), max(datos$educacion), length.out = 100)
datos_pred <- data.frame(
  educacion = edu_seq, edad = mean(datos$edad),
  sexo = factor("Hombre", levels = levels(datos$sexo)),
  experiencia = mean(datos$experiencia),
  hijos = median(datos$hijos), ingreso_familiar = mean(datos$ingreso_familiar)
)
prob_hombre <- predict(probit_modelo, newdata = datos_pred, type = "response")
datos_pred$sexo <- factor("Mujer", levels = levels(datos$sexo))
prob_mujer  <- predict(probit_modelo, newdata = datos_pred, type = "response")

plot(edu_seq, prob_hombre, type = "l", col = "#3498DB", lwd = 2,
     ylim = c(0, 1), xlab = "Anos de educacion", ylab = "P(trabaja = 1)",
     main = "Probit: P(trabaja) vs Educacion")
lines(edu_seq, prob_mujer, col = "#E91E8C", lwd = 2, lty = 2)
abline(h = 0.5, col = "gray60", lty = 3)
legend("topleft", legend = c("Hombre", "Mujer"),
       col = c("#3498DB", "#E91E8C"), lty = c(1, 2), lwd = 2, bty = "n")

em_df <- as.data.frame(probit_mfx$mfxest)
em_df$variable <- rownames(em_df)
em_df <- em_df[order(em_df$`dF/dx`), ]
barplot(em_df$`dF/dx` * 100, names.arg = em_df$variable,
        horiz = TRUE, las = 1,
        col = ifelse(em_df$`dF/dx` > 0, "#2ECC71", "#E74C3C"),
        main = "Probit: Efectos marginales (pp)",
        xlab = "Efecto marginal (pp)", cex.names = 0.75)
abline(v = 0, col = "black", lwd = 1)

hist(datos$prob_probit, breaks = 30, col = "#9B59B6", border = "white",
     main = "Probit: Distribucion P(trabaja=1)",
     xlab = "Probabilidad predicha", ylab = "Frecuencia")
abline(v = 0.5, col = "red", lty = 2, lwd = 2)

boxplot(prob_probit ~ trabaja, data = datos,
        names = c("No trabaja (y=0)", "Trabaja (y=1)"),
        col = c("#E74C3C", "#2ECC71"),
        main = "Probit: Probabilidades por grupo",
        ylab = "P(trabaja = 1)")
abline(h = 0.5, col = "blue", lty = 2)

par(mfrow = c(1, 1))

pausa()


# ============================================================================
# SECCION 4: MODELO LOGIT
# ============================================================================
# P(trabaja = 1 | x) = Lambda(x'beta) = 1 / (1 + exp(-x'beta))
# Estimacion por MV. Ventaja: interpretacion via odds ratios.
# ============================================================================

logit_modelo <- glm(trabaja ~ educacion + edad + sexo +
                      experiencia + hijos + ingreso_familiar,
                    data = datos, family = binomial(link = "logit"))

cat("=== RESULTADOS DEL MODELO LOGIT ===\n")
print(summary(logit_modelo))

# --- Tabla de coeficientes ---
coefs_l <- summary(logit_modelo)$coefficients
tabla_logit <- data.frame(
  Variable      = rownames(coefs_l),
  Coeficiente   = round(coefs_l[, 1], 4),
  Std_Error     = round(coefs_l[, 2], 4),
  z_valor       = round(coefs_l[, 3], 3),
  p_valor       = round(coefs_l[, 4], 4),
  Significativo = ifelse(coefs_l[, 4] < 0.001, "***",
                  ifelse(coefs_l[, 4] < 0.01,  "**",
                  ifelse(coefs_l[, 4] < 0.05,  "*",
                  ifelse(coefs_l[, 4] < 0.10,  ".",  ""))))
)
rownames(tabla_logit) <- NULL
tabla_dt(tabla_logit, titulo = "Coeficientes del modelo Logit", filas = FALSE)

pausa()

# --- Odds Ratios ---
cat("\n=== ODDS RATIOS e IC al 95% ===\n")
OR       <- exp(coef(logit_modelo))
IC_lower <- exp(confint.default(logit_modelo)[, 1])
IC_upper <- exp(confint.default(logit_modelo)[, 2])

tabla_OR <- data.frame(
  Variable  = names(OR),
  OR        = round(OR, 4),
  IC_inf_95 = round(IC_lower, 4),
  IC_sup_95 = round(IC_upper, 4),
  Efecto    = ifelse(OR > 1, "Aumenta odds (+)",
               ifelse(OR < 1, "Reduce odds (-)", "Sin efecto"))
)
rownames(tabla_OR) <- NULL
tabla_dt(tabla_OR, titulo = "Odds Ratios del modelo Logit", filas = FALSE)

cat("\nINTERPRETACION:\n")
OR_edu <- exp(coef(logit_modelo)["educacion"])
cat(sprintf("  educacion: OR = %.3f -> +1 ano aumenta odds en %.1f%%\n",
            OR_edu, (OR_edu - 1) * 100))
OR_sexo <- exp(coef(logit_modelo)["sexoMujer"])
cat(sprintf("  sexo (Mujer): OR = %.3f -> odds de mujer son %.1f%% menores\n",
            OR_sexo, abs(1 - OR_sexo) * 100))

pausa()

# --- Efectos marginales ---
cat("\n=== EFECTOS MARGINALES LOGIT (en la media) ===\n")
logit_mfx <- logitmfx(trabaja ~ educacion + edad + sexo +
                        experiencia + hijos + ingreso_familiar,
                      data = datos, atmean = TRUE)
print(logit_mfx)

cat("\n=== EFECTOS MARGINALES LOGIT (promedio - AME) ===\n")
logit_mfx_avg <- logitmfx(trabaja ~ educacion + edad + sexo +
                             experiencia + hijos + ingreso_familiar,
                           data = datos, atmean = FALSE)
print(logit_mfx_avg)

# --- Bondad de ajuste ---
cat("\n=== BONDAD DE AJUSTE ===\n")
pseudoR2_logit <- 1 - (logit_modelo$deviance / logit_modelo$null.deviance)
cat(sprintf("Pseudo-R2 McFadden: %.4f\n", pseudoR2_logit))
cat(sprintf("AIC: %.2f  |  BIC: %.2f\n\n", AIC(logit_modelo), BIC(logit_modelo)))

# --- Test de Wald ---
cat("=== TEST DE WALD ===\n")
wald_logit <- wald.test(b = coef(logit_modelo), Sigma = vcov(logit_modelo), Terms = 2:7)
print(wald_logit)

# --- Clasificacion ---
cat("\n=== TABLA DE CLASIFICACION (umbral = 0.5) ===\n")
datos$prob_logit <- predict(logit_modelo, type = "response")
datos$pred_logit <- ifelse(datos$prob_logit > 0.5, 1, 0)
tabla_clasif_l <- table(Observado = datos$trabaja, Predicho = datos$pred_logit)
print(tabla_clasif_l)
cat(sprintf("Tasa de aciertos: %.1f%%\n",
            100 * sum(diag(tabla_clasif_l)) / sum(tabla_clasif_l)))

# --- Graficos Logit ---
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))

# Curva sigmoide
edu_seq <- seq(min(datos$educacion), max(datos$educacion), length.out = 100)
datos_pred_h <- data.frame(
  educacion = edu_seq, edad = mean(datos$edad),
  sexo = factor("Hombre", levels = levels(datos$sexo)),
  experiencia = mean(datos$experiencia),
  hijos = median(datos$hijos), ingreso_familiar = mean(datos$ingreso_familiar)
)
datos_pred_m <- datos_pred_h
datos_pred_m$sexo <- factor("Mujer", levels = levels(datos$sexo))

prob_h <- predict(logit_modelo, newdata = datos_pred_h, type = "response")
prob_m <- predict(logit_modelo, newdata = datos_pred_m, type = "response")

plot(edu_seq, prob_h, type = "l", col = "#3498DB", lwd = 2.5,
     ylim = c(0, 1), xlab = "Anos de educacion", ylab = "P(trabaja = 1)",
     main = "Logit: Funcion sigmoide")
lines(edu_seq, prob_m, col = "#E91E8C", lwd = 2.5, lty = 2)
abline(h = 0.5, col = "gray60", lty = 3)
legend("topleft", legend = c("Hombre", "Mujer"),
       col = c("#3498DB", "#E91E8C"), lty = c(1, 2), lwd = 2, bty = "n")

# Forest plot OR
OR_vars <- OR[-1]
OR_low  <- IC_lower[-1]
OR_high <- IC_upper[-1]
n_vars  <- length(OR_vars)
plot(OR_vars, 1:n_vars, pch = 16, col = "#2C3E50",
     xlim = c(min(OR_low)*0.8, max(OR_high)*1.2),
     xlab = "Odds Ratio (exp(beta))", ylab = "",
     yaxt = "n", main = "Logit: Odds Ratios con IC 95%", log = "x")
segments(OR_low, 1:n_vars, OR_high, 1:n_vars, col = "#2C3E50", lwd = 2)
axis(2, at = 1:n_vars, labels = names(OR_vars), las = 1, cex.axis = 0.8)
abline(v = 1, col = "red", lty = 2, lwd = 2)

# Efectos marginales Logit
em_df_l <- as.data.frame(logit_mfx$mfxest)
em_df_l$variable <- rownames(em_df_l)
em_df_l <- em_df_l[order(em_df_l$`dF/dx`), ]
barplot(em_df_l$`dF/dx` * 100, names.arg = em_df_l$variable,
        horiz = TRUE, las = 1,
        col = ifelse(em_df_l$`dF/dx` > 0, "#27AE60", "#C0392B"),
        main = "Logit: Efectos marginales (pp)",
        xlab = "Cambio en P(trabaja) en pp", cex.names = 0.75)
abline(v = 0, col = "black")

# Comparacion Logit vs Probit (scatter)
em_comp <- data.frame(
  em_logit  = logit_mfx$mfxest[, "dF/dx"] * 100,
  em_probit = probit_mfx$mfxest[, "dF/dx"] * 100
)
plot(em_comp$em_probit, em_comp$em_logit,
     pch = 16, col = "#8E44AD", cex = 1.5,
     xlab = "EM Probit (pp)", ylab = "EM Logit (pp)",
     main = "Logit vs Probit: Efectos marginales")
abline(0, 1, col = "red", lty = 2, lwd = 2)
text(em_comp$em_probit, em_comp$em_logit,
     labels = rownames(em_comp), pos = 4, cex = 0.7)

par(mfrow = c(1, 1))

pausa()


# ============================================================================
# SECCION 5: COMPARACION COMPLETA MLP vs PROBIT vs LOGIT
# ============================================================================

cat("=== TABLA 1: COMPARACION DE COEFICIENTES ===\n")
cat("(Los coeficientes NO son comparables entre modelos; importan signos y significatividad)\n\n")

coef_mlp    <- coef(mlp_modelo)
coef_probit <- coef(probit_modelo)
coef_logit  <- coef(logit_modelo)
pval_mlp    <- summary(mlp_modelo)$coefficients[, 4]
pval_probit <- summary(probit_modelo)$coefficients[, 4]
pval_logit  <- summary(logit_modelo)$coefficients[, 4]

sig <- function(p) ifelse(p < 0.001, "***",
                   ifelse(p < 0.01, "**",
                   ifelse(p < 0.05, "*",
                   ifelse(p < 0.10, ".", ""))))

tabla_coef <- data.frame(
  Variable = names(coef_mlp),
  MLP      = paste0(round(coef_mlp, 4), sig(pval_mlp)),
  Probit   = paste0(round(coef_probit, 4), sig(pval_probit)),
  Logit    = paste0(round(coef_logit, 4), sig(pval_logit))
)
tabla_dt(tabla_coef, titulo = "Comparacion de coeficientes: MLP vs Probit vs Logit", filas = FALSE)

pausa()

# --- Efectos marginales comparados ---
cat("\n=== TABLA 2: EFECTOS MARGINALES (pp) ===\n")
em_mlp    <- coef_mlp[-1] * 100
em_probit <- probit_mfx_avg$mfxest[, "dF/dx"] * 100
em_logit  <- logit_mfx_avg$mfxest[, "dF/dx"] * 100

tabla_em <- data.frame(
  Variable       = names(em_mlp),
  MLP_pp         = round(em_mlp, 3),
  Probit_pp      = round(em_probit, 3),
  Logit_pp       = round(em_logit, 3),
  Dif_Prob_Logit = round(em_probit - em_logit, 4)
)
rownames(tabla_em) <- NULL
tabla_dt(tabla_em, titulo = "Efectos marginales comparados (pp)", filas = FALSE)

cat("\nNOTA: Los EM de Probit y Logit son practicamente identicos.\n")

pausa()

# --- Bondad de ajuste comparada ---
cat("\n=== TABLA 3: METRICAS DE AJUSTE ===\n")
R2_mlp     <- summary(mlp_modelo)$r.squared
pseudoR2_p <- 1 - probit_modelo$deviance / probit_modelo$null.deviance
pseudoR2_l <- 1 - logit_modelo$deviance  / logit_modelo$null.deviance

aciertos_mlp <- mean(ifelse(datos$prob_mlp > 0.5, 1, 0) == datos$trabaja) * 100
aciertos_p   <- mean(ifelse(datos$prob_probit > 0.5, 1, 0) == datos$trabaja) * 100
aciertos_l   <- mean(ifelse(datos$prob_logit  > 0.5, 1, 0) == datos$trabaja) * 100
fuera_rango  <- sum(datos$prob_mlp < 0 | datos$prob_mlp > 1)

tabla_ajuste <- data.frame(
  Metrica = c("R2 / Pseudo-R2 McFadden", "AIC", "BIC",
              "Tasa de aciertos (%)", "Predicciones fuera [0,1]"),
  MLP    = c(round(R2_mlp, 4), round(AIC(mlp_modelo), 1),
             round(BIC(mlp_modelo), 1), round(aciertos_mlp, 1), fuera_rango),
  Probit = c(round(pseudoR2_p, 4), round(AIC(probit_modelo), 1),
             round(BIC(probit_modelo), 1), round(aciertos_p, 1), 0),
  Logit  = c(round(pseudoR2_l, 4), round(AIC(logit_modelo), 1),
             round(BIC(logit_modelo), 1), round(aciertos_l, 1), 0)
)
tabla_dt(tabla_ajuste, titulo = "Metricas de ajuste: MLP vs Probit vs Logit", filas = FALSE)

# --- Grafico principal: curvas de probabilidad predicha ---
edu_seq <- seq(6, 20, length.out = 200)
datos_grid <- data.frame(
  educacion = edu_seq, edad = mean(datos$edad),
  sexo = factor("Hombre", levels = levels(datos$sexo)),
  experiencia = mean(datos$experiencia),
  hijos = 1, ingreso_familiar = mean(datos$ingreso_familiar)
)

pred_mlp_c    <- predict(mlp_modelo,    newdata = datos_grid)
pred_probit_c <- predict(probit_modelo, newdata = datos_grid, type = "response")
pred_logit_c  <- predict(logit_modelo,  newdata = datos_grid, type = "response")

par(mfrow = c(1, 1), mar = c(5, 5, 4, 2))
set.seed(42)
plot(datos$educacion + runif(nrow(datos), -0.4, 0.4),
     datos$trabaja + runif(nrow(datos), -0.03, 0.03),
     pch = 16, cex = 0.5, col = rgb(0.5, 0.5, 0.5, 0.3),
     xlab = "Anos de educacion", ylab = "P(trabaja = 1)",
     main = "MLP, Probit y Logit: Probabilidades predichas",
     ylim = c(-0.15, 1.15))
lines(edu_seq, pred_mlp_c,    col = "#E74C3C", lwd = 2.5)
lines(edu_seq, pred_probit_c, col = "#2ECC71", lwd = 2.5, lty = 2)
lines(edu_seq, pred_logit_c,  col = "#3498DB", lwd = 2.5, lty = 3)
abline(h = c(0, 1), col = "gray40", lty = 2, lwd = 1.5)
legend("topleft",
       legend = c("Datos observados", "MLP (lineal)", "Probit", "Logit"),
       pch = c(16, NA, NA, NA), lty = c(NA, 1, 2, 3),
       lwd = c(NA, 2.5, 2.5, 2.5),
       col = c(rgb(0.5, 0.5, 0.5, 0.5), "#E74C3C", "#2ECC71", "#3498DB"),
       bty = "n", cex = 0.85)

# --- Diferencias entre predicciones ---
par(mfrow = c(1, 2), mar = c(4, 4, 3, 1))

diff_mlp_logit <- pred_mlp_c - pred_logit_c
plot(edu_seq, diff_mlp_logit, type = "l", col = "#E74C3C", lwd = 2,
     xlab = "Anos de educacion", ylab = "P_MLP - P_Logit",
     main = "Diferencia MLP - Logit")
abline(h = 0, lty = 2, col = "gray50")

diff_probit_logit <- pred_probit_c - pred_logit_c
plot(edu_seq, diff_probit_logit, type = "l", col = "#2ECC71", lwd = 2,
     xlab = "Anos de educacion", ylab = "P_Probit - P_Logit",
     main = "Diferencia Probit - Logit")
abline(h = 0, lty = 2, col = "gray50")

par(mfrow = c(1, 1))

cat("\n=== RESUMEN COMPARATIVO ===\n")
cat("MLP   : Simple pero con problemas teoricos (P fuera [0,1], EM constantes)\n")
cat("PROBIT: Basado en Normal estandar. Solido teoricamente. Uso: error ~ N(0,1).\n")
cat("LOGIT : Basado en distribucion logistica. Odds ratios interpretables.\n")
cat("Conclusion: Probit y Logit dan resultados practicamente identicos.\n")

cat("\n=== FIN DEL SCRIPT TEMA 2 ===\n")