← Catálogo
· Capítulo 2
T02_mECO_Script_Eleccion_Discreta.R
# =============================================================================
# TEMA 2 — ELECCIÓN DISCRETA (LOGIT Y PROBIT): SCRIPT INTERACTIVO
# =============================================================================
# Autor: Carlos de Anta Puig
# Economista · Perito Financiero
# Miembro del Colegio de Economistas de Madrid
# Miembro del Instituto Español de Analistas Financieros (IEAF)
# Profesor de Econometría y Microeconometría
# carlos@cwconsultores.com
#
# Manual de Microeconometría - Digital Reasons (2026)
# https://github.com/carlanta/MicroEconometrics
#
# Versión: 1.0
# Fecha: 2026-03-27
#
# Ejecutar bloque a bloque en RStudio.
# =============================================================================
#
# Autor: Carlos de Anta Puig
# Economista - Perito Financiero
# Miembro del Colegio de Economistas de Madrid
# Miembro del Instituto Espanol de Analistas Financieros (IEAF)
# Profesor de Econometría y Microeconometría
# carlosmaria.deanta@unir.net
#
# Version: 1.0 | Fecha: Marzo 2026
#
# Descripcion:
# Script pedagogico completo del Tema 2. Cubre:
# - Generacion del dataset simulado de participacion laboral
# - Modelo Lineal de Probabilidad (MLP) y sus limitaciones
# - Modelo Probit: estimacion, efectos marginales, bondad de ajuste
# - Modelo Logit: odds ratios, efectos marginales, comparacion
# - Comparacion completa MLP vs Probit vs Logit
#
# Uso:
# Ejecutar el script completo en RStudio. Se detiene entre secciones
# con readline() para que el alumno pueda leer los resultados.
# ============================================================================
# ============================================================================
# CARGA DE LIBRERIAS
# ============================================================================
suppressPackageStartupMessages({
library(lmtest) # test de heterocedasticidad (bptest)
library(sandwich) # errores estandar robustos
library(mfx) # efectos marginales: probitmfx(), logitmfx()
library(aod) # test de Wald: wald.test()
library(DT) # tablas interactivas
})
# Funcion auxiliar para pausas interactivas
pausa <- function() invisible(readline(prompt = "Pulsa ENTER para continuar..."))
# Funcion auxiliar para tablas interactivas DT
tabla_dt <- function(datos, titulo = "", decimales = 2, filas = TRUE) {
if (is.numeric(datos) && !is.null(names(datos))) {
datos <- data.frame(Nombre = names(datos), Valor = round(datos, decimales))
filas <- FALSE
}
cat("[Abriendo tabla en el Viewer...]\n")
print(DT::datatable(
datos,
caption = titulo,
extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel'),
pageLength = 15,
scrollX = TRUE
),
rownames = filas
))
}
# Funcion auxiliar para transparencias en graficos
alpha_color <- function(col, alpha) {
rgb_vals <- col2rgb(col) / 255
rgb(rgb_vals[1], rgb_vals[2], rgb_vals[3], alpha = alpha)
}
# ============================================================================
# SECCION 1: GENERACION DEL DATASET
# ============================================================================
# Generamos un dataset simulado de participacion laboral en Espana.
# Variable dependiente: trabaja (1 = participa, 0 = no participa)
# El proceso generador de datos (DGP) usa una funcion logistica.
# ============================================================================
set.seed(2024)
N <- 500
# --- Variables explicativas ---
educacion <- round(runif(N, min = 6, max = 20))
edad <- round(runif(N, min = 18, max = 65))
sexo <- rbinom(N, size = 1, prob = 0.50)
experiencia <- pmax(0, round(edad - educacion - 6 + rnorm(N, mean = 0, sd = 3)))
hijos <- sample(0:4, N, replace = TRUE, prob = c(0.35, 0.30, 0.20, 0.10, 0.05))
ingreso_familiar <- round(pmax(0, rnorm(N, mean = 10, sd = 6)), 1)
# --- Indice latente y probabilidad (DGP verdadero) ---
b0 <- -2.00; b1 <- 0.20; b2 <- 0.03; b3 <- -0.60
b4 <- 0.05; b5 <- -0.35; b6 <- -0.08
z <- b0 + b1*educacion + b2*edad + b3*sexo +
b4*experiencia + b5*hijos + b6*ingreso_familiar
prob_trabaja <- 1 / (1 + exp(-z))
trabaja <- rbinom(N, size = 1, prob = prob_trabaja)
# --- Construccion del data frame ---
datos <- data.frame(
id = 1:N,
trabaja = trabaja,
educacion = educacion,
edad = edad,
sexo = factor(sexo, levels = c(0, 1), labels = c("Hombre", "Mujer")),
experiencia = experiencia,
hijos = hijos,
ingreso_familiar = ingreso_familiar
)
# --- Exploracion basica ---
cat("=== DATASET GENERADO ===\n")
cat("Observaciones:", N, "\n")
cat("Proporcion que trabaja:", round(mean(datos$trabaja), 3), "\n\n")
print(summary(datos[, -1]))
cat("\nTasa de participacion por sexo:\n")
print(tapply(datos$trabaja, datos$sexo, mean))
# Tabla interactiva del dataset
tabla_dt(datos, titulo = "Dataset: participacion laboral simulada")
# --- Grafico exploratorio ---
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
barplot(table(datos$trabaja),
names.arg = c("No trabaja (0)", "Trabaja (1)"),
col = c("#E74C3C", "#2ECC71"),
main = "Distribucion de 'trabaja'",
ylab = "Frecuencia", xlab = "")
boxplot(educacion ~ trabaja, data = datos,
names = c("No trabaja", "Trabaja"),
col = c("#E74C3C", "#2ECC71"),
main = "Educacion segun participacion",
ylab = "Anos de educacion")
barplot(tapply(datos$trabaja, datos$sexo, mean),
col = c("#3498DB", "#E91E8C"),
main = "Tasa de participacion por sexo",
ylab = "Proporcion que trabaja", ylim = c(0, 1))
abline(h = mean(datos$trabaja), lty = 2, col = "gray50")
hist(prob_trabaja, breaks = 20, col = "#9B59B6",
main = "Distribucion de P(trabaja=1) verdadera",
xlab = "Probabilidad", ylab = "Frecuencia")
abline(v = 0.5, lty = 2, col = "red")
par(mfrow = c(1, 1))
# Para que se pare
pausa()
# ============================================================================
# SECCION 2: MODELO LINEAL DE PROBABILIDAD (MLP)
# ============================================================================
# P(trabaja = 1 | x) = b0 + b1*educacion + ... (estimacion por MCO)
# Demostramos sus tres limitaciones principales.
# ============================================================================
mlp_modelo <- lm(trabaja ~ educacion + edad + sexo +
experiencia + hijos + ingreso_familiar,
data = datos)
cat("=== RESULTADOS DEL MLP (MCO) ===\n")
print(summary(mlp_modelo))
# --- Tabla de coeficientes ---
coefs <- summary(mlp_modelo)$coefficients
tabla_mlp <- data.frame(
Variable = rownames(coefs),
Coeficiente = round(coefs[, 1], 4),
Std_Error = round(coefs[, 2], 4),
t_valor = round(coefs[, 3], 3),
p_valor = round(coefs[, 4], 4),
Significativo = ifelse(coefs[, 4] < 0.001, "***",
ifelse(coefs[, 4] < 0.01, "**",
ifelse(coefs[, 4] < 0.05, "*",
ifelse(coefs[, 4] < 0.10, ".", ""))))
)
rownames(tabla_mlp) <- NULL
tabla_dt(tabla_mlp, titulo = "Coeficientes del MLP", filas = FALSE)
# --- Interpretacion ---
cat("\n=== INTERPRETACION (MLP: coefs = efectos marginales) ===\n")
b <- coef(mlp_modelo)
cat(sprintf("educacion : +1 ano -> +%.1f pp en P(trabaja)\n", b["educacion"]*100))
cat(sprintf("sexo (Mujer) : ser mujer -> %.1f pp\n", b["sexoMujer"]*100))
cat(sprintf("hijos : +1 hijo -> %.1f pp\n", b["hijos"]*100))
cat(sprintf("ingreso_fam : +1.000 EUR -> %.1f pp\n", b["ingreso_familiar"]*100))
# --- Limitacion 1: predicciones fuera de [0,1] ---
datos$prob_mlp <- predict(mlp_modelo, type = "response")
n_fuera <- sum(datos$prob_mlp < 0 | datos$prob_mlp > 1)
cat(sprintf("\nPredicciones fuera de [0,1]: %d de %d (%.1f%%)\n",
n_fuera, nrow(datos), 100 * n_fuera / nrow(datos)))
# --- Limitacion 2: heterocedasticidad ---
cat("\n=== TEST DE HETEROCEDASTICIDAD (Breusch-Pagan) ===\n")
bp_test <- bptest(mlp_modelo)
print(bp_test)
if (bp_test$p.value < 0.05) {
message("-> Rechazamos H0: HAY heterocedasticidad (p < 0.05)")
} else {
message("-> No se rechaza H0: sin evidencia de heterocedasticidad")
}
# --- Errores robustos ---
cat("\n=== COEFICIENTES CON ERRORES ROBUSTOS (HC1) ===\n")
print(coeftest(mlp_modelo, vcov = vcovHC(mlp_modelo, type = "HC1")))
# --- Graficos MLP ---
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
hist(datos$prob_mlp, breaks = 30, col = "#3498DB", border = "white",
main = "MLP: Distribucion de probabilidades predichas",
xlab = "P(trabaja = 1)", ylab = "Frecuencia")
abline(v = c(0, 1), col = "red", lty = 2, lwd = 2)
mlp_simple <- lm(trabaja ~ educacion, data = datos)
plot(datos$educacion + runif(nrow(datos), -0.3, 0.3),
datos$trabaja,
pch = 16, col = alpha_color("#2ECC71", 0.3),
main = "MLP: Recta ajustada (trabaja ~ educacion)",
xlab = "Anos de educacion", ylab = "P(trabaja = 1)")
abline(mlp_simple, col = "#E74C3C", lwd = 2)
abline(h = c(0, 1), col = "gray50", lty = 2)
plot(mlp_modelo$fitted.values, mlp_modelo$residuals,
pch = 16, col = alpha_color("#9B59B6", 0.4),
main = "MLP: Residuos vs Valores Ajustados",
xlab = "Valores ajustados", ylab = "Residuos")
abline(h = 0, col = "red", lty = 2, lwd = 2)
edu_seq <- seq(6, 20, length.out = 100)
pred_mlp_line <- coef(mlp_simple)[1] + coef(mlp_simple)[2] * edu_seq
logit_simple <- glm(trabaja ~ educacion, data = datos, family = binomial)
pred_logit_line <- predict(logit_simple,
newdata = data.frame(educacion = edu_seq),
type = "response")
plot(edu_seq, pred_mlp_line,
type = "l", col = "#E74C3C", lwd = 2, ylim = c(-0.1, 1.1),
main = "MLP vs Logit: forma funcional",
xlab = "Anos de educacion", ylab = "P(trabaja = 1)")
lines(edu_seq, pred_logit_line, col = "#2980B9", lwd = 2, lty = 2)
abline(h = c(0, 1), col = "gray70", lty = 3)
legend("topleft", legend = c("MLP (lineal)", "Logit (no lineal)"),
col = c("#E74C3C", "#2980B9"), lty = c(1, 2), lwd = 2, bty = "n")
par(mfrow = c(1, 1))
cat("\n=== RESUMEN: LIMITACIONES DEL MLP ===\n")
cat("1. Probabilidades fuera de [0,1]\n")
cat("2. Heterocedasticidad del error\n")
cat("3. Efecto marginal constante (no realista)\n")
cat("-> Solucion: Probit o Logit\n")
pausa()
# ============================================================================
# SECCION 3: MODELO PROBIT
# ============================================================================
# P(trabaja = 1 | x) = Phi(x'beta)
# Phi = CDF de la Normal Estandar. Estimacion por Maxima Verosimilitud.
# ============================================================================
probit_modelo <- glm(trabaja ~ educacion + edad + sexo +
experiencia + hijos + ingreso_familiar,
data = datos, family = binomial(link = "probit"))
cat("=== RESULTADOS DEL MODELO PROBIT ===\n")
print(summary(probit_modelo))
# --- Tabla de coeficientes ---
coefs_p <- summary(probit_modelo)$coefficients
tabla_probit <- data.frame(
Variable = rownames(coefs_p),
Coeficiente = round(coefs_p[, 1], 4),
Std_Error = round(coefs_p[, 2], 4),
z_valor = round(coefs_p[, 3], 3),
p_valor = round(coefs_p[, 4], 4),
Significativo = ifelse(coefs_p[, 4] < 0.001, "***",
ifelse(coefs_p[, 4] < 0.01, "**",
ifelse(coefs_p[, 4] < 0.05, "*",
ifelse(coefs_p[, 4] < 0.10, ".", ""))))
)
rownames(tabla_probit) <- NULL
tabla_dt(tabla_probit, titulo = "Coeficientes del modelo Probit", filas = FALSE)
# --- Interpretacion de coeficientes (solo signo) ---
cat("\nATENCION: En Probit los coeficientes NO son efectos marginales.\n")
cat("Solo informan la DIRECCION del efecto:\n\n")
b <- coef(probit_modelo)
for (nm in names(b)[-1]) {
signo <- ifelse(b[nm] > 0, "AUMENTA (+)", "DISMINUYE (-)")
cat(sprintf(" %-20s -> P(trabaja=1) %s\n", nm, signo))
}
pausa()
# --- Efectos marginales ---
cat("\n=== EFECTOS MARGINALES (en la media muestral) ===\n")
probit_mfx <- probitmfx(trabaja ~ educacion + edad + sexo +
experiencia + hijos + ingreso_familiar,
data = datos, atmean = TRUE)
print(probit_mfx)
cat("\n=== EFECTOS MARGINALES (promedio - AME) ===\n")
probit_mfx_avg <- probitmfx(trabaja ~ educacion + edad + sexo +
experiencia + hijos + ingreso_familiar,
data = datos, atmean = FALSE)
print(probit_mfx_avg)
# --- Interpretacion de efectos marginales ---
cat("\n=== INTERPRETACION DE EFECTOS MARGINALES ===\n")
em <- probit_mfx$mfxest
cat(sprintf("educacion : +1 ano -> +%.2f pp en P(trabaja)\n", em["educacion","dF/dx"]*100))
cat(sprintf("sexo (Mujer): ser mujer -> %.2f pp\n", em["sexoMujer","dF/dx"]*100))
cat(sprintf("hijos : +1 hijo -> %.2f pp\n", em["hijos","dF/dx"]*100))
pausa()
# --- Bondad de ajuste ---
cat("\n=== BONDAD DE AJUSTE ===\n")
pseudoR2_mcfadden <- 1 - (probit_modelo$deviance / probit_modelo$null.deviance)
cat(sprintf("Pseudo-R2 de McFadden: %.4f\n", pseudoR2_mcfadden))
cat(" (Valores entre 0.2 y 0.4 = buen ajuste en modelos binarios)\n\n")
cat(sprintf("AIC: %.2f\n", AIC(probit_modelo)))
cat(sprintf("BIC: %.2f\n\n", BIC(probit_modelo)))
# --- Contrastes ---
cat("=== TEST DE WALD (significacion conjunta) ===\n")
cat("H0: todos los coeficientes (excepto constante) = 0\n\n")
wald_res <- wald.test(b = coef(probit_modelo), Sigma = vcov(probit_modelo), Terms = 2:7)
print(wald_res)
cat("\n=== TEST DE RATIO DE VEROSIMILITUD (LR) ===\n")
probit_nulo <- glm(trabaja ~ 1, data = datos, family = binomial(link = "probit"))
LR_stat <- 2 * (logLik(probit_modelo) - logLik(probit_nulo))
LR_df <- length(coef(probit_modelo)) - 1
LR_pval <- pchisq(LR_stat, df = LR_df, lower.tail = FALSE)
cat(sprintf("LR = %.2f, gl = %d, p-valor = %.6f\n", LR_stat, LR_df, LR_pval))
if (LR_pval < 0.05) message("-> Rechazamos H0: modelo conjuntamente significativo.")
# --- Clasificacion ---
cat("\n=== TABLA DE CLASIFICACION (umbral = 0.5) ===\n")
datos$prob_probit <- predict(probit_modelo, type = "response")
datos$pred_probit <- ifelse(datos$prob_probit > 0.5, 1, 0)
tabla_clasif_p <- table(Observado = datos$trabaja, Predicho = datos$pred_probit)
print(tabla_clasif_p)
cat(sprintf("Tasa de aciertos: %.1f%%\n", 100 * sum(diag(tabla_clasif_p)) / sum(tabla_clasif_p)))
# --- Graficos Probit ---
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
edu_seq <- seq(min(datos$educacion), max(datos$educacion), length.out = 100)
datos_pred <- data.frame(
educacion = edu_seq, edad = mean(datos$edad),
sexo = factor("Hombre", levels = levels(datos$sexo)),
experiencia = mean(datos$experiencia),
hijos = median(datos$hijos), ingreso_familiar = mean(datos$ingreso_familiar)
)
prob_hombre <- predict(probit_modelo, newdata = datos_pred, type = "response")
datos_pred$sexo <- factor("Mujer", levels = levels(datos$sexo))
prob_mujer <- predict(probit_modelo, newdata = datos_pred, type = "response")
plot(edu_seq, prob_hombre, type = "l", col = "#3498DB", lwd = 2,
ylim = c(0, 1), xlab = "Anos de educacion", ylab = "P(trabaja = 1)",
main = "Probit: P(trabaja) vs Educacion")
lines(edu_seq, prob_mujer, col = "#E91E8C", lwd = 2, lty = 2)
abline(h = 0.5, col = "gray60", lty = 3)
legend("topleft", legend = c("Hombre", "Mujer"),
col = c("#3498DB", "#E91E8C"), lty = c(1, 2), lwd = 2, bty = "n")
em_df <- as.data.frame(probit_mfx$mfxest)
em_df$variable <- rownames(em_df)
em_df <- em_df[order(em_df$`dF/dx`), ]
barplot(em_df$`dF/dx` * 100, names.arg = em_df$variable,
horiz = TRUE, las = 1,
col = ifelse(em_df$`dF/dx` > 0, "#2ECC71", "#E74C3C"),
main = "Probit: Efectos marginales (pp)",
xlab = "Efecto marginal (pp)", cex.names = 0.75)
abline(v = 0, col = "black", lwd = 1)
hist(datos$prob_probit, breaks = 30, col = "#9B59B6", border = "white",
main = "Probit: Distribucion P(trabaja=1)",
xlab = "Probabilidad predicha", ylab = "Frecuencia")
abline(v = 0.5, col = "red", lty = 2, lwd = 2)
boxplot(prob_probit ~ trabaja, data = datos,
names = c("No trabaja (y=0)", "Trabaja (y=1)"),
col = c("#E74C3C", "#2ECC71"),
main = "Probit: Probabilidades por grupo",
ylab = "P(trabaja = 1)")
abline(h = 0.5, col = "blue", lty = 2)
par(mfrow = c(1, 1))
pausa()
# ============================================================================
# SECCION 4: MODELO LOGIT
# ============================================================================
# P(trabaja = 1 | x) = Lambda(x'beta) = 1 / (1 + exp(-x'beta))
# Estimacion por MV. Ventaja: interpretacion via odds ratios.
# ============================================================================
logit_modelo <- glm(trabaja ~ educacion + edad + sexo +
experiencia + hijos + ingreso_familiar,
data = datos, family = binomial(link = "logit"))
cat("=== RESULTADOS DEL MODELO LOGIT ===\n")
print(summary(logit_modelo))
# --- Tabla de coeficientes ---
coefs_l <- summary(logit_modelo)$coefficients
tabla_logit <- data.frame(
Variable = rownames(coefs_l),
Coeficiente = round(coefs_l[, 1], 4),
Std_Error = round(coefs_l[, 2], 4),
z_valor = round(coefs_l[, 3], 3),
p_valor = round(coefs_l[, 4], 4),
Significativo = ifelse(coefs_l[, 4] < 0.001, "***",
ifelse(coefs_l[, 4] < 0.01, "**",
ifelse(coefs_l[, 4] < 0.05, "*",
ifelse(coefs_l[, 4] < 0.10, ".", ""))))
)
rownames(tabla_logit) <- NULL
tabla_dt(tabla_logit, titulo = "Coeficientes del modelo Logit", filas = FALSE)
pausa()
# --- Odds Ratios ---
cat("\n=== ODDS RATIOS e IC al 95% ===\n")
OR <- exp(coef(logit_modelo))
IC_lower <- exp(confint.default(logit_modelo)[, 1])
IC_upper <- exp(confint.default(logit_modelo)[, 2])
tabla_OR <- data.frame(
Variable = names(OR),
OR = round(OR, 4),
IC_inf_95 = round(IC_lower, 4),
IC_sup_95 = round(IC_upper, 4),
Efecto = ifelse(OR > 1, "Aumenta odds (+)",
ifelse(OR < 1, "Reduce odds (-)", "Sin efecto"))
)
rownames(tabla_OR) <- NULL
tabla_dt(tabla_OR, titulo = "Odds Ratios del modelo Logit", filas = FALSE)
cat("\nINTERPRETACION:\n")
OR_edu <- exp(coef(logit_modelo)["educacion"])
cat(sprintf(" educacion: OR = %.3f -> +1 ano aumenta odds en %.1f%%\n",
OR_edu, (OR_edu - 1) * 100))
OR_sexo <- exp(coef(logit_modelo)["sexoMujer"])
cat(sprintf(" sexo (Mujer): OR = %.3f -> odds de mujer son %.1f%% menores\n",
OR_sexo, abs(1 - OR_sexo) * 100))
pausa()
# --- Efectos marginales ---
cat("\n=== EFECTOS MARGINALES LOGIT (en la media) ===\n")
logit_mfx <- logitmfx(trabaja ~ educacion + edad + sexo +
experiencia + hijos + ingreso_familiar,
data = datos, atmean = TRUE)
print(logit_mfx)
cat("\n=== EFECTOS MARGINALES LOGIT (promedio - AME) ===\n")
logit_mfx_avg <- logitmfx(trabaja ~ educacion + edad + sexo +
experiencia + hijos + ingreso_familiar,
data = datos, atmean = FALSE)
print(logit_mfx_avg)
# --- Bondad de ajuste ---
cat("\n=== BONDAD DE AJUSTE ===\n")
pseudoR2_logit <- 1 - (logit_modelo$deviance / logit_modelo$null.deviance)
cat(sprintf("Pseudo-R2 McFadden: %.4f\n", pseudoR2_logit))
cat(sprintf("AIC: %.2f | BIC: %.2f\n\n", AIC(logit_modelo), BIC(logit_modelo)))
# --- Test de Wald ---
cat("=== TEST DE WALD ===\n")
wald_logit <- wald.test(b = coef(logit_modelo), Sigma = vcov(logit_modelo), Terms = 2:7)
print(wald_logit)
# --- Clasificacion ---
cat("\n=== TABLA DE CLASIFICACION (umbral = 0.5) ===\n")
datos$prob_logit <- predict(logit_modelo, type = "response")
datos$pred_logit <- ifelse(datos$prob_logit > 0.5, 1, 0)
tabla_clasif_l <- table(Observado = datos$trabaja, Predicho = datos$pred_logit)
print(tabla_clasif_l)
cat(sprintf("Tasa de aciertos: %.1f%%\n",
100 * sum(diag(tabla_clasif_l)) / sum(tabla_clasif_l)))
# --- Graficos Logit ---
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
# Curva sigmoide
edu_seq <- seq(min(datos$educacion), max(datos$educacion), length.out = 100)
datos_pred_h <- data.frame(
educacion = edu_seq, edad = mean(datos$edad),
sexo = factor("Hombre", levels = levels(datos$sexo)),
experiencia = mean(datos$experiencia),
hijos = median(datos$hijos), ingreso_familiar = mean(datos$ingreso_familiar)
)
datos_pred_m <- datos_pred_h
datos_pred_m$sexo <- factor("Mujer", levels = levels(datos$sexo))
prob_h <- predict(logit_modelo, newdata = datos_pred_h, type = "response")
prob_m <- predict(logit_modelo, newdata = datos_pred_m, type = "response")
plot(edu_seq, prob_h, type = "l", col = "#3498DB", lwd = 2.5,
ylim = c(0, 1), xlab = "Anos de educacion", ylab = "P(trabaja = 1)",
main = "Logit: Funcion sigmoide")
lines(edu_seq, prob_m, col = "#E91E8C", lwd = 2.5, lty = 2)
abline(h = 0.5, col = "gray60", lty = 3)
legend("topleft", legend = c("Hombre", "Mujer"),
col = c("#3498DB", "#E91E8C"), lty = c(1, 2), lwd = 2, bty = "n")
# Forest plot OR
OR_vars <- OR[-1]
OR_low <- IC_lower[-1]
OR_high <- IC_upper[-1]
n_vars <- length(OR_vars)
plot(OR_vars, 1:n_vars, pch = 16, col = "#2C3E50",
xlim = c(min(OR_low)*0.8, max(OR_high)*1.2),
xlab = "Odds Ratio (exp(beta))", ylab = "",
yaxt = "n", main = "Logit: Odds Ratios con IC 95%", log = "x")
segments(OR_low, 1:n_vars, OR_high, 1:n_vars, col = "#2C3E50", lwd = 2)
axis(2, at = 1:n_vars, labels = names(OR_vars), las = 1, cex.axis = 0.8)
abline(v = 1, col = "red", lty = 2, lwd = 2)
# Efectos marginales Logit
em_df_l <- as.data.frame(logit_mfx$mfxest)
em_df_l$variable <- rownames(em_df_l)
em_df_l <- em_df_l[order(em_df_l$`dF/dx`), ]
barplot(em_df_l$`dF/dx` * 100, names.arg = em_df_l$variable,
horiz = TRUE, las = 1,
col = ifelse(em_df_l$`dF/dx` > 0, "#27AE60", "#C0392B"),
main = "Logit: Efectos marginales (pp)",
xlab = "Cambio en P(trabaja) en pp", cex.names = 0.75)
abline(v = 0, col = "black")
# Comparacion Logit vs Probit (scatter)
em_comp <- data.frame(
em_logit = logit_mfx$mfxest[, "dF/dx"] * 100,
em_probit = probit_mfx$mfxest[, "dF/dx"] * 100
)
plot(em_comp$em_probit, em_comp$em_logit,
pch = 16, col = "#8E44AD", cex = 1.5,
xlab = "EM Probit (pp)", ylab = "EM Logit (pp)",
main = "Logit vs Probit: Efectos marginales")
abline(0, 1, col = "red", lty = 2, lwd = 2)
text(em_comp$em_probit, em_comp$em_logit,
labels = rownames(em_comp), pos = 4, cex = 0.7)
par(mfrow = c(1, 1))
pausa()
# ============================================================================
# SECCION 5: COMPARACION COMPLETA MLP vs PROBIT vs LOGIT
# ============================================================================
cat("=== TABLA 1: COMPARACION DE COEFICIENTES ===\n")
cat("(Los coeficientes NO son comparables entre modelos; importan signos y significatividad)\n\n")
coef_mlp <- coef(mlp_modelo)
coef_probit <- coef(probit_modelo)
coef_logit <- coef(logit_modelo)
pval_mlp <- summary(mlp_modelo)$coefficients[, 4]
pval_probit <- summary(probit_modelo)$coefficients[, 4]
pval_logit <- summary(logit_modelo)$coefficients[, 4]
sig <- function(p) ifelse(p < 0.001, "***",
ifelse(p < 0.01, "**",
ifelse(p < 0.05, "*",
ifelse(p < 0.10, ".", ""))))
tabla_coef <- data.frame(
Variable = names(coef_mlp),
MLP = paste0(round(coef_mlp, 4), sig(pval_mlp)),
Probit = paste0(round(coef_probit, 4), sig(pval_probit)),
Logit = paste0(round(coef_logit, 4), sig(pval_logit))
)
tabla_dt(tabla_coef, titulo = "Comparacion de coeficientes: MLP vs Probit vs Logit", filas = FALSE)
pausa()
# --- Efectos marginales comparados ---
cat("\n=== TABLA 2: EFECTOS MARGINALES (pp) ===\n")
em_mlp <- coef_mlp[-1] * 100
em_probit <- probit_mfx_avg$mfxest[, "dF/dx"] * 100
em_logit <- logit_mfx_avg$mfxest[, "dF/dx"] * 100
tabla_em <- data.frame(
Variable = names(em_mlp),
MLP_pp = round(em_mlp, 3),
Probit_pp = round(em_probit, 3),
Logit_pp = round(em_logit, 3),
Dif_Prob_Logit = round(em_probit - em_logit, 4)
)
rownames(tabla_em) <- NULL
tabla_dt(tabla_em, titulo = "Efectos marginales comparados (pp)", filas = FALSE)
cat("\nNOTA: Los EM de Probit y Logit son practicamente identicos.\n")
pausa()
# --- Bondad de ajuste comparada ---
cat("\n=== TABLA 3: METRICAS DE AJUSTE ===\n")
R2_mlp <- summary(mlp_modelo)$r.squared
pseudoR2_p <- 1 - probit_modelo$deviance / probit_modelo$null.deviance
pseudoR2_l <- 1 - logit_modelo$deviance / logit_modelo$null.deviance
aciertos_mlp <- mean(ifelse(datos$prob_mlp > 0.5, 1, 0) == datos$trabaja) * 100
aciertos_p <- mean(ifelse(datos$prob_probit > 0.5, 1, 0) == datos$trabaja) * 100
aciertos_l <- mean(ifelse(datos$prob_logit > 0.5, 1, 0) == datos$trabaja) * 100
fuera_rango <- sum(datos$prob_mlp < 0 | datos$prob_mlp > 1)
tabla_ajuste <- data.frame(
Metrica = c("R2 / Pseudo-R2 McFadden", "AIC", "BIC",
"Tasa de aciertos (%)", "Predicciones fuera [0,1]"),
MLP = c(round(R2_mlp, 4), round(AIC(mlp_modelo), 1),
round(BIC(mlp_modelo), 1), round(aciertos_mlp, 1), fuera_rango),
Probit = c(round(pseudoR2_p, 4), round(AIC(probit_modelo), 1),
round(BIC(probit_modelo), 1), round(aciertos_p, 1), 0),
Logit = c(round(pseudoR2_l, 4), round(AIC(logit_modelo), 1),
round(BIC(logit_modelo), 1), round(aciertos_l, 1), 0)
)
tabla_dt(tabla_ajuste, titulo = "Metricas de ajuste: MLP vs Probit vs Logit", filas = FALSE)
# --- Grafico principal: curvas de probabilidad predicha ---
edu_seq <- seq(6, 20, length.out = 200)
datos_grid <- data.frame(
educacion = edu_seq, edad = mean(datos$edad),
sexo = factor("Hombre", levels = levels(datos$sexo)),
experiencia = mean(datos$experiencia),
hijos = 1, ingreso_familiar = mean(datos$ingreso_familiar)
)
pred_mlp_c <- predict(mlp_modelo, newdata = datos_grid)
pred_probit_c <- predict(probit_modelo, newdata = datos_grid, type = "response")
pred_logit_c <- predict(logit_modelo, newdata = datos_grid, type = "response")
par(mfrow = c(1, 1), mar = c(5, 5, 4, 2))
set.seed(42)
plot(datos$educacion + runif(nrow(datos), -0.4, 0.4),
datos$trabaja + runif(nrow(datos), -0.03, 0.03),
pch = 16, cex = 0.5, col = rgb(0.5, 0.5, 0.5, 0.3),
xlab = "Anos de educacion", ylab = "P(trabaja = 1)",
main = "MLP, Probit y Logit: Probabilidades predichas",
ylim = c(-0.15, 1.15))
lines(edu_seq, pred_mlp_c, col = "#E74C3C", lwd = 2.5)
lines(edu_seq, pred_probit_c, col = "#2ECC71", lwd = 2.5, lty = 2)
lines(edu_seq, pred_logit_c, col = "#3498DB", lwd = 2.5, lty = 3)
abline(h = c(0, 1), col = "gray40", lty = 2, lwd = 1.5)
legend("topleft",
legend = c("Datos observados", "MLP (lineal)", "Probit", "Logit"),
pch = c(16, NA, NA, NA), lty = c(NA, 1, 2, 3),
lwd = c(NA, 2.5, 2.5, 2.5),
col = c(rgb(0.5, 0.5, 0.5, 0.5), "#E74C3C", "#2ECC71", "#3498DB"),
bty = "n", cex = 0.85)
# --- Diferencias entre predicciones ---
par(mfrow = c(1, 2), mar = c(4, 4, 3, 1))
diff_mlp_logit <- pred_mlp_c - pred_logit_c
plot(edu_seq, diff_mlp_logit, type = "l", col = "#E74C3C", lwd = 2,
xlab = "Anos de educacion", ylab = "P_MLP - P_Logit",
main = "Diferencia MLP - Logit")
abline(h = 0, lty = 2, col = "gray50")
diff_probit_logit <- pred_probit_c - pred_logit_c
plot(edu_seq, diff_probit_logit, type = "l", col = "#2ECC71", lwd = 2,
xlab = "Anos de educacion", ylab = "P_Probit - P_Logit",
main = "Diferencia Probit - Logit")
abline(h = 0, lty = 2, col = "gray50")
par(mfrow = c(1, 1))
cat("\n=== RESUMEN COMPARATIVO ===\n")
cat("MLP : Simple pero con problemas teoricos (P fuera [0,1], EM constantes)\n")
cat("PROBIT: Basado en Normal estandar. Solido teoricamente. Uso: error ~ N(0,1).\n")
cat("LOGIT : Basado en distribucion logistica. Odds ratios interpretables.\n")
cat("Conclusion: Probit y Logit dan resultados practicamente identicos.\n")
cat("\n=== FIN DEL SCRIPT TEMA 2 ===\n")