---
pagetitle: "Meta-analysis: how it works, and where the pitfalls are"
lang: en
format:
  revealjs:
    slide-number: true
    progress: true
    controls: true
    transition: fade
    center: false
    css: week14_meta_analysis_interactive.scss
    code-overflow: wrap
    embed-resources: true
server: shiny
execute:
  echo: false
  warning: false
  message: false
---

## Meta-analysis: how it works, and where the pitfalls are {.hero-slide}

::: {.title-meta}
::: {.hero-block}
Bartosz Maćkiewicz
Advanced Statistical Methods and Models in Experimental Design
::: ::: {.title-context}
Week 14 | Meta-analysis
::: ::: ## Lecture plan 1. What meta-analysis is, and why it can be more informative than simply counting "votes." 2. How we measure effect size and combine results across studies in meta-analytic models. 3. How to interpret heterogeneity, and what happens when data or parts of the literature are missing. 4. Why sound hypothesis-testing logic is a prerequisite for trustworthy evidence synthesis. 5. How *p-hacking* can artificially inflate the proportion of statistically significant results, and how *p-curve* can reveal it. 6. How *publication bias* distorts the literature, and what bias corrections such as *trim-and-fill* can and cannot do. ## Why do we need meta-analysis? {.divider-slide} ::: {.divider-lead} Start with a simpler question: what can evidence synthesis tell us, and why does vote counting mislead? ::: ```{r} #| context: server library(shiny) library(ggplot2) library(dplyr) library(metafor) pal_black <- "#000505" pal_indigo <- "#3B3355" pal_grape <- "#5D5D81" pal_sky <- "#BFCDE0" pal_white <- "#FEFCFD" col_primary <- pal_indigo col_secondary <- pal_grape col_muted <- pal_sky col_dark <- pal_black col_light <- pal_white col_secondary_soft <- grDevices::adjustcolor(col_secondary, alpha.f = 0.78) theme_set( theme_minimal(base_size = 12) + theme( plot.title.position = "plot", plot.title = element_text(face = "bold", color = col_primary), plot.subtitle = element_text(color = col_secondary), axis.title = element_text(color = col_dark), axis.text = element_text(color = col_dark), legend.title = element_text(color = col_dark), legend.text = element_text(color = col_dark), panel.grid.minor = element_blank(), panel.grid.major = element_line(color = grDevices::adjustcolor(col_muted, alpha.f = 0.55)), plot.background = element_rect(fill = col_light, color = NA), panel.background = element_rect(fill = col_light, color = NA), legend.background = element_rect(fill = col_light, color = NA), legend.key = element_rect(fill = col_light, color = NA) ) ) safe_n <- 
function(x, min_n = 6) { max(min_n, as.integer(round(x))) } calc_smd <- function(a, b) { es <- metafor::escalc( measure = "SMD", m1i = mean(a), sd1i = sd(a), n1i = length(a), m2i = mean(b), sd2i = sd(b), n2i = length(b) ) c(yi = as.numeric(es$yi), vi = as.numeric(es$vi)) } simulate_p_hacking <- function( B = 1500, n0 = 40, max_extra = 40, trim_sd = 2, use_one_sided = TRUE, use_trim = TRUE, use_rank = TRUE, use_peeking = TRUE, alpha = 0.05 ) { p_values <- numeric(B) methods <- character(B) for (i in seq_len(B)) { x <- rnorm(n0) y <- rnorm(n0) p <- t.test(x, y, var.equal = TRUE)$p.value best_p <- p best_method <- "Classical t test" if (p < alpha) { p_values[i] <- p methods[i] <- best_method next } if (isTRUE(use_one_sided)) { p_one <- min( t.test(x, y, alternative = "less")$p.value, t.test(x, y, alternative = "greater")$p.value ) if (p_one < best_p) { best_p <- p_one best_method <- "One-sided test" } if (p_one < alpha) { p_values[i] <- p_one methods[i] <- "One-sided test" next } } if (isTRUE(use_trim)) { x_trim <- x[abs(scale(x)) < trim_sd] y_trim <- y[abs(scale(y)) < trim_sd] if (length(x_trim) > 5 && length(y_trim) > 5) { p_trim <- t.test(x_trim, y_trim, var.equal = TRUE)$p.value if (p_trim < best_p) { best_p <- p_trim best_method <- "Outlier removal" } if (p_trim < alpha) { p_values[i] <- p_trim methods[i] <- "Outlier removal" next } } } if (isTRUE(use_rank)) { p_rank <- suppressWarnings(wilcox.test(x, y, exact = FALSE)$p.value) if (!is.na(p_rank) && p_rank < best_p) { best_p <- p_rank best_method <- "Wilcoxon test" } if (!is.na(p_rank) && p_rank < alpha) { p_values[i] <- p_rank methods[i] <- "Wilcoxon test" next } } if (isTRUE(use_peeking) && max_extra > 0) { add_each <- 2 reps <- floor(max_extra / add_each) for (step in seq_len(reps)) { x <- c(x, rnorm(add_each)) y <- c(y, rnorm(add_each)) p_seq <- t.test(x, y, var.equal = TRUE)$p.value if (p_seq < best_p) { best_p <- p_seq best_method <- "Sequentially adding N" } if (p_seq < alpha) { p_values[i] <- p_seq 
methods[i] <- "Sequentially adding N" break } } if (methods[i] != "") { next } } p_values[i] <- best_p methods[i] <- best_method } data.frame( p_value = p_values, method = methods, significant = p_values < alpha ) } simulate_pcurve_studies <- function( B = 2000, n0 = 40, effect = 0.25, var_effect = 0.00, p_hack_prob = 0.4, hack_tries = 12, alpha = 0.05 ) { out <- vector("list", B) for (i in seq_len(B)) { n <- safe_n(rnorm(1, n0, sqrt(n0 / 3)), min_n = 8) delta <- rnorm(1, effect, sqrt(var_effect)) x <- rnorm(n) y <- rnorm(n, mean = delta) p_classic <- t.test(x, y, var.equal = TRUE)$p.value p_final <- p_classic hacked <- runif(1) < p_hack_prob tries <- max(0, as.integer(round(hack_tries))) # Simplified model: in "hacked" studies we try a few extra analyses # and report the significant result that lands closest to the 0.05 threshold. if (hacked && p_classic >= alpha && tries > 0) { p_candidates <- replicate(tries, { t.test(rnorm(n), rnorm(n, mean = delta), var.equal = TRUE)$p.value }) sig_candidates <- p_candidates[p_candidates < alpha] if (length(sig_candidates) > 0) { p_final <- max(sig_candidates) } } out[[i]] <- data.frame( p_value = p_final, significant = p_final < alpha, source = ifelse(hacked, "Hacked analysis", "Standard analysis") ) } bind_rows(out) } simulate_publication_bias <- function( B = 400, n0 = 50, effect = 0.25, var_effect = 0.10, var_n = 150, alpha = 0.05 ) { out <- vector("list", B) for (i in seq_len(B)) { n <- safe_n(rnorm(1, n0, sqrt(var_n)), min_n = 12) delta <- rnorm(1, effect, sqrt(var_effect)) control <- rnorm(n) treat <- rnorm(n, mean = delta) p_value <- t.test(control, treat, var.equal = TRUE)$p.value es <- calc_smd(treat, control) out[[i]] <- data.frame( study_id = i, yi = es["yi"], vi = es["vi"], se = sqrt(es["vi"]), p_value = p_value, published = p_value < alpha ) } bind_rows(out) } p_fmt <- function(x) { ifelse(x < 0.001, "< 0.001", sprintf("%.3f", x)) } cohen_components <- function(group_treat, group_control) { n1 <- 
length(group_treat) n2 <- length(group_control) m1 <- mean(group_treat) m2 <- mean(group_control) s1 <- sd(group_treat) s2 <- sd(group_control) s_within <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)) d <- (m1 - m2) / s_within vd <- (n1 + n2) / (n1 * n2) + (d^2) / (2 * (n1 + n2)) se <- sqrt(vd) data.frame( n1 = n1, n2 = n2, m1 = m1, m2 = m2, s1 = s1, s2 = s2, s_within = s_within, d = d, vd = vd, se = se, ci_low = d - 1.96 * se, ci_high = d + 1.96 * se ) } simulate_vote_meta <- function( k = 25, n_min = 24, n_max = 80, effect_mean = 0.20, tau = 0.12, alpha = 0.05 ) { out <- vector("list", k) for (i in seq_len(k)) { n1 <- sample(n_min:n_max, 1) n2 <- sample(n_min:n_max, 1) true_d <- rnorm(1, effect_mean, tau) control <- rnorm(n1, 0, 1) treat <- rnorm(n2, true_d, 1) test_p <- t.test(treat, control, var.equal = TRUE)$p.value comp <- cohen_components(treat, control) out[[i]] <- data.frame( study_id = i, yi = comp$d, vi = comp$vd, se = comp$se, p_value = test_p, significant = test_p < alpha ) } bind_rows(out) } pooling_summary <- function(df, tau2 = NULL) { w_fixed <- 1 / df$vi fixed_est <- sum(w_fixed * df$yi) / sum(w_fixed) fixed_se <- sqrt(1 / sum(w_fixed)) if (is.null(tau2)) { q_stat <- sum(w_fixed * (df$yi - fixed_est)^2) c_val <- sum(w_fixed) - (sum(w_fixed^2) / sum(w_fixed)) tau2 <- max((q_stat - (nrow(df) - 1)) / c_val, 0) } w_random <- 1 / (df$vi + tau2) random_est <- sum(w_random * df$yi) / sum(w_random) random_se <- sqrt(1 / sum(w_random)) list( tau2 = tau2, fixed_est = fixed_est, fixed_se = fixed_se, random_est = random_est, random_se = random_se, w_fixed = w_fixed, w_random = w_random ) } or_to_d <- function(a, b, c, d_cell) { odds_ratio <- (a * d_cell) / (b * c) log_or <- log(odds_ratio) d_cohen <- log_or * sqrt(3) / pi var_log_or <- 1 / a + 1 / b + 1 / c + 1 / d_cell var_d <- var_log_or * (3 / (pi^2)) se_d <- sqrt(var_d) data.frame( odds_ratio = odds_ratio, log_or = log_or, d = d_cohen, var_d = var_d, se_d = se_d, ci_low = d_cohen - 1.96 * 
se_d, ci_high = d_cohen + 1.96 * se_d ) } ``` ## What can meta-analysis tell us about the phenomenon of interest? 1. Does the effect actually exist, and if so, how large is it and with how much uncertainty can it be estimated? 2. Are the results across studies consistent enough to suggest a common pattern, or do they instead point to substantial heterogeneity? 3. Which study features, populations, or procedures might systematically influence the size of the observed effect? 4. Do some results depart from the rest in a way that calls for additional methodological or theoretical explanation? 5. Could the picture of the literature be distorted by *publication bias* or other forms of result selection? ## How does meta-analysis work? 1. The starting point is a systematic collection of studies relevant to the phenomenon, using explicit inclusion and exclusion criteria. 2. For each study, we compute a comparable effect-size measure together with the uncertainty of that estimate. 3. We then combine the individual study results into one pooled estimate, giving greater weight to more precise studies. 4. The result is a pooled effect together with its uncertainty and an estimate of between-study variation. ## Meta-analysis versus a systematic review 1. Even in systematic reviews, one still encounters *vote counting*: tallying how many studies were significant and how many were not. 2. That procedure rests on a mistaken interpretation of *p* values and can therefore lead to poor conclusions. 3. If many underpowered studies point in the same direction but miss the significance cutoff, vote counting can miss the pattern; meta-analysis can still recover it by pooling the estimates and weighting them by precision. 
## Application: vote counting versus meta-analysis {.app-slide} ::: {.columns} ::: {.column width="34%"} ::: {.app-controls} ```{r} shiny::sliderInput("vote_k", "Number of studies", min = 8, max = 70, value = 24, step = 1) shiny::sliderInput("vote_n_min", "Minimum N per study", min = 12, max = 120, value = 24, step = 2) shiny::sliderInput("vote_n_max", "Maximum N per study", min = 20, max = 220, value = 80, step = 2) shiny::sliderInput("vote_eff", "Mean true effect (d)", min = 0, max = 0.9, value = 0.20, step = 0.05) shiny::sliderInput("vote_tau", "True-effect heterogeneity", min = 0, max = 0.6, value = 0.12, step = 0.02) shiny::sliderInput("vote_alpha", "Significance threshold", min = 0.01, max = 0.10, value = 0.05, step = 0.005) shiny::actionButton("vote_run", "Draw a new study set", class = "btn-accent") ``` ::: ::: ::: {.column width="66%"} ::: {.app-results} ```{r} shiny::plotOutput("vote_sig_plot", height = "140px") shiny::plotOutput("vote_forest_plot", height = "385px") shiny::uiOutput("vote_summary") ``` ::: ::: ::: ```{r} #| context: server observeEvent(input$vote_n_min, { shiny::updateSliderInput( session, "vote_n_max", min = input$vote_n_min + 2, value = max(input$vote_n_min + 2, input$vote_n_max) ) }) vote_meta_data <- eventReactive(input$vote_run, { simulate_vote_meta( k = input$vote_k, n_min = input$vote_n_min, n_max = input$vote_n_max, effect_mean = input$vote_eff, tau = input$vote_tau, alpha = input$vote_alpha ) }, ignoreNULL = FALSE) output$vote_sig_plot <- renderPlot({ df <- vote_meta_data() vote_n <- nrow(df) plot_df <- data.frame( status = factor(c("p < alpha", "p >= alpha"), levels = c("p < alpha", "p >= alpha")), n = c(sum(df$significant), sum(!df$significant)) ) %>% mutate( share = n / vote_n, label = ifelse(n > 0, sprintf("%d (%.0f%%)", n, 100 * share), "") ) ggplot(plot_df, aes(x = "Studies", y = share, fill = status)) + geom_col(width = 0.42) + geom_text( aes(label = label, color = status), position = position_stack(vjust = 0.5), fontface 
= "bold", size = 4.1, show.legend = FALSE ) + coord_flip() + scale_fill_manual(values = c("p < alpha" = col_primary, "p >= alpha" = col_muted)) + scale_color_manual(values = c("p < alpha" = col_light, "p >= alpha" = col_dark)) + scale_y_continuous( limits = c(0, 1), expand = c(0, 0), labels = function(x) paste0(round(100 * x), "%") ) + labs( title = "Vote counting: what proportion of studies crossed the significance threshold?", x = NULL, y = "Proportion of studies", fill = NULL ) + theme( legend.position = "top", legend.direction = "horizontal", legend.text = element_text(size = 9), legend.key.height = grid::unit(0.3, "cm"), legend.key.width = grid::unit(0.6, "cm"), legend.spacing.x = grid::unit(0.18, "cm"), axis.text.y = element_blank(), axis.ticks.y = element_blank() ) }) output$vote_forest_plot <- renderPlot({ df <- vote_meta_data() %>% arrange(yi) fit_re <- try(metafor::rma(yi, vi, data = df, method = "REML"), silent = TRUE) if (inherits(fit_re, "try-error")) { plot.new() title("Could not estimate the model for the forest plot") return(invisible()) } cex_val <- if (nrow(df) <= 20) { 0.75 } else if (nrow(df) <= 35) { 0.62 } else if (nrow(df) <= 50) { 0.52 } else { 0.44 } op <- par(no.readonly = TRUE) on.exit(par(op), add = TRUE) par(mar = c(3.6, 4.2, 2.2, 1.0)) metafor::forest( fit_re, slab = paste("Study", df$study_id), xlab = "Effect size (Cohen's d)", header = c("Study", "d [95% CI]"), refline = 0, showweights = TRUE, cex = cex_val ) }) output$vote_summary <- renderUI({ df <- vote_meta_data() vote_n <- nrow(df) vote_sig_n <- sum(df$significant) vote_share <- mean(df$significant) fit_fe <- try(metafor::rma(yi, vi, data = df, method = "FE"), silent = TRUE) fit_re <- try(metafor::rma(yi, vi, data = df, method = "REML"), silent = TRUE) if (inherits(fit_fe, "try-error") || inherits(fit_re, "try-error")) { return(shiny::div(class = "metric-box", "Could not estimate the meta-analytic model for the current data.")) } re_est <- as.numeric(fit_re$b[1, 1]) re_low <- 
fit_re$ci.lb re_high <- fit_re$ci.ub fe_est <- as.numeric(fit_fe$b[1, 1]) ci_excludes_zero <- (re_low > 0) || (re_high < 0) vote_msg <- if (vote_share >= 0.5) { "most studies cross the significance threshold" } else { "most studies do not cross the significance threshold" } meta_msg <- if (ci_excludes_zero) { "meta-analysis indicates an effect different from zero (95% CI excludes 0)" } else { "meta-analysis does not justify rejecting the null effect (95% CI includes 0)" } contrast_msg <- if (vote_share < 0.5 && ci_excludes_zero) { "Here vote counting misses the pattern, but meta-analysis recovers it by pooling the estimates." } else if (vote_share >= 0.5 && !ci_excludes_zero) { "Here the situation is reversed: many individual p < alpha, but the pooled estimate remains uncertain." } else { "In this setting, vote counting and meta-analysis lead to a similar conclusion." } shiny::div( class = "metric-box", shiny::HTML(sprintf( "Significant studies: %s/%s (%.1f%%), that is, %s.
Pooled effect (fixed): %.3f
Pooled effect (random, REML): %.3f [%.3f, %.3f]
Meta-analytic conclusion: %s.
Comparison comment: %s", vote_sig_n, vote_n, 100 * vote_share, vote_msg, fe_est, re_est, re_low, re_high, meta_msg, contrast_msg )) ) }) ``` ## How do we measure effect size? {.divider-slide} ::: {.divider-lead} Before we combine results across studies, we need to put them on a common scale and understand what a one-unit effect actually means. ::: ## Effect size: Cohen's *d* Cohen's *d* expresses the group difference in standard-deviation units, so results from different scales can be compared: $$ d = \frac{\bar{X_1} - \bar{X_2}}{S_{within}} $$ $\bar{X_1} - \bar{X_2}$ is the difference in means between the two groups. In the denominator we use the pooled standard deviation: $$ S_{within} = \sqrt{\frac{(n_1 -1)S^2_1 + (n_2 - 1)S^2_2}{n_1 + n_2 - 2}} $$ This means that Cohen's *d* is the difference in means expressed in standard deviations, which is why it can be compared across studies. ## Application: draw one study and compute Cohen's *d* {.app-slide} ::: {.columns} ::: {.column width="34%"} ::: {.app-controls} ```{r} shiny::sliderInput("study_n1", "N in the experimental group", min = 12, max = 250, value = 50, step = 2) shiny::sliderInput("study_n2", "N in the control group", min = 12, max = 250, value = 50, step = 2) shiny::sliderInput("study_true_d", "True effect (d)", min = -0.8, max = 1.2, value = 0.35, step = 0.05) shiny::sliderInput("study_sd", "Standard deviation", min = 0.5, max = 3.0, value = 1.0, step = 0.1) shiny::actionButton("study_draw", "Draw a new study", class = "btn-accent") ``` ::: ::: ::: {.column width="66%"} ::: {.app-results} ```{r} shiny::plotOutput("study_d_plot", height = "450px") shiny::uiOutput("study_d_summary") ``` ::: ::: ::: ```{r} #| context: server single_study <- eventReactive(input$study_draw, { control <- rnorm(input$study_n2, mean = 0, sd = input$study_sd) treat <- rnorm(input$study_n1, mean = input$study_true_d * input$study_sd, sd = input$study_sd) comp <- cohen_components(treat, control) df <- bind_rows( data.frame(group = 
"Control group", outcome = control), data.frame(group = "Experimental group", outcome = treat) ) %>% mutate(group = factor(group, levels = c("Control group", "Experimental group"))) list(data = df, comp = comp) }, ignoreNULL = FALSE) output$study_d_plot <- renderPlot({ df <- single_study()$data ggplot(df, aes(group, outcome, fill = group)) + geom_violin(alpha = 0.28, width = 0.82, color = NA, trim = FALSE) + geom_boxplot(width = 0.22, outlier.alpha = 0.18, alpha = 0.55) + geom_jitter(width = 0.08, alpha = 0.22, size = 1.2) + stat_summary(fun = mean, geom = "point", size = 4, color = col_dark) + scale_fill_manual(values = c("Control group" = col_muted, "Experimental group" = col_secondary)) + labs( title = "Random data from one study: control group and experimental group", x = NULL, y = "Outcome", fill = NULL ) + theme(legend.position = "top") }) output$study_d_summary <- renderUI({ c <- single_study()$comp shiny::div( class = "metric-box", shiny::HTML(sprintf( "Means: experimental = %.3f, control = %.3f
S_within: %.3f
Cohen's d: %.3f
SE(d): %.3f
95%% CI: [%.3f, %.3f]",
      c$m1, c$m2, c$s_within, c$d, c$se, c$ci_low, c$ci_high
    ))
  )
})
```

## How do we combine results across studies? {.divider-slide}

::: {.divider-lead}
The pooled estimate depends on study precision, the weighting rule, and the model.
:::

## The variance of *d* and study precision

The sampling variance of `d` is given by:

$$ V_d = \frac{n_1 + n_2}{n_1 n_2} + \frac{d^2}{2(n_1 + n_2)} $$

Larger samples give smaller sampling variance, so those studies receive more weight in the meta-analysis. From $V_d$ we also obtain the standard error ($SE_d = \sqrt{V_d}$) and the confidence interval ($d \pm 1.96 \times SE_d$).

## Meta-analysis: the fixed-effect model

In the simplest model, we assume that all studies estimate one common "true" effect. Each study is assigned a weight:

$$ W_i = \frac{1}{V_i} $$

The pooled effect is then a weighted mean:

$$ M = \frac{\sum_{i=1}^{k} W_i Y_i}{\sum_{i=1}^{k} W_i} $$

The variance of that estimate is:

$$ V_M = \frac{1}{\sum_{i=1}^{k} W_i} $$

In practice, this means that more precise studies, that is, studies with smaller variance, have greater influence on the final result.

## Meta-analysis: the random-effects model

In the random-effects model we do not assume a single point value for the effect. We allow true effects to differ across studies, so we add a between-study component ($\tau^2$) to the variance:

$$ W_i^{RE} = \frac{1}{V_i + \tau^2} $$

Observed differences now reflect both sampling error and real between-study heterogeneity, so the weights become less extreme than under a fixed-effect model.
## Application: study weights and model choice {.app-slide} ::: {.columns} ::: {.column width="30%"} ::: {.app-controls} ```{r} shiny::sliderInput("model_k", "Number of studies", min = 6, max = 40, value = 16, step = 1) shiny::sliderInput("model_n_min", "Minimum N", min = 12, max = 120, value = 25, step = 1) shiny::sliderInput("model_n_max", "Maximum N", min = 20, max = 220, value = 90, step = 1) shiny::sliderInput("model_mu", "Mean effect in the population", min = -0.2, max = 0.9, value = 0.22, step = 0.02) shiny::sliderInput("model_tau_true", "True heterogeneity (tau)", min = 0, max = 0.6, value = 0.18, step = 0.02) shiny::actionButton("model_run", "Recompute models", class = "btn-accent") ``` ::: ::: ::: {.column width="66%"} ::: {.app-results} ```{r} shiny::plotOutput("model_weight_plot", height = "280px") shiny::plotOutput("model_pool_plot", height = "190px") shiny::uiOutput("model_summary") ``` ::: ::: ::: ```{r} #| context: server observeEvent(input$model_n_min, { shiny::updateSliderInput( session, "model_n_max", min = input$model_n_min + 1, value = max(input$model_n_max, input$model_n_min + 1) ) }) model_data <- eventReactive(input$model_run, { simulate_vote_meta( k = input$model_k, n_min = input$model_n_min, n_max = input$model_n_max, effect_mean = input$model_mu, tau = input$model_tau_true, alpha = 0.05 ) }, ignoreNULL = FALSE) output$model_weight_plot <- renderPlot({ df <- model_data() pooled <- pooling_summary(df) wdf <- bind_rows( data.frame(study_id = factor(df$study_id), weight = pooled$w_fixed, model = "Fixed"), data.frame(study_id = factor(df$study_id), weight = pooled$w_random, model = "Random") ) ggplot(wdf, aes(study_id, weight, fill = model)) + geom_col(position = "dodge", width = 0.72) + scale_fill_manual(values = c("Fixed" = col_primary, "Random" = col_secondary)) + labs( title = "Study weights: fixed versus random", x = "Study", y = "Weight", fill = "Model" ) + theme( axis.text.x = element_blank(), axis.ticks.x = element_blank(), 
legend.position = "top" ) }) output$model_pool_plot <- renderPlot({ df <- model_data() pooled <- pooling_summary(df) sum_df <- data.frame( model = c("Fixed effects", "Random effects"), est = c(pooled$fixed_est, pooled$random_est), low = c(pooled$fixed_est - 1.96 * pooled$fixed_se, pooled$random_est - 1.96 * pooled$random_se), high = c(pooled$fixed_est + 1.96 * pooled$fixed_se, pooled$random_est + 1.96 * pooled$random_se) ) ggplot(sum_df, aes(est, model, color = model)) + geom_vline(xintercept = 0, linetype = "dashed", color = col_muted) + geom_errorbarh(aes(xmin = low, xmax = high), height = 0.15, linewidth = 1.4) + geom_point(size = 3.7) + scale_color_manual(values = c("Fixed effects" = col_primary, "Random effects" = col_secondary)) + labs( title = "Pooled estimate and 95% CI under both models", x = "Effect size", y = NULL, color = NULL ) + theme(legend.position = "none") }) output$model_summary <- renderUI({ df <- model_data() pooled <- pooling_summary(df) fit_fe <- metafor::rma(yi, vi, data = df, method = "FE") fit_re <- metafor::rma(yi, vi, data = df, method = "DL") shiny::div( class = "metric-box", shiny::HTML(sprintf( "Fixed effect: %.3f (SE = %.3f)
Random effect: %.3f (SE = %.3f)
Q: %.2f, I²: %.1f%%
Estimated tau² (DL): %.3f | True tau² in the simulation: %.3f", pooled$fixed_est, pooled$fixed_se, pooled$random_est, pooled$random_se, fit_fe$QE, fit_re$I2, fit_re$tau2, input$model_tau_true^2 )) ) }) ``` ## What do we do when the data are not ideal? {.divider-slide} ::: {.divider-lead} In real meta-analyses, the data are rarely complete or homogeneous. ::: ## Binary outcomes: from the odds ratio to *d* Not all studies report means and standard deviations. With a binary outcome (yes/no; illness/recovery), we can instead compute an odds ratio: $$ OR = \frac{A \times D}{B \times C} $$ We can then convert it to the scale of Cohen's `d`: $$ d = \ln(OR) \times \frac{\sqrt{3}}{\pi} $$ For completeness, the variance is then defined as: $$ V_d = V_{\ln OR} \times \frac{3}{\pi^2} $$ $$ V_{\ln OR} = \frac{1}{A} + \frac{1}{B} + \frac{1}{C} + \frac{1}{D} $$ This allows binary outcomes to be expressed on the same metric as continuous outcomes. ## Application: convert $OR \rightarrow d$ {.app-slide} ::: {.columns} ::: {.column width="30%"} ::: {.app-controls} ```{r} shiny::sliderInput("or_a", "A: Group 1 / YES response", min = 5, max = 300, value = 80, step = 1) shiny::sliderInput("or_b", "B: Group 1 / NO response", min = 5, max = 300, value = 20, step = 1) shiny::sliderInput("or_c", "C: Group 2 / YES response", min = 5, max = 300, value = 20, step = 1) shiny::sliderInput("or_d", "D: Group 2 / NO response", min = 5, max = 300, value = 80, step = 1) ``` ::: ::: ::: {.column width="66%"} ::: {.app-results} ```{r} shiny::plotOutput("or_heatmap", height = "305px") shiny::plotOutput("or_effect_plot", height = "185px") shiny::uiOutput("or_summary") ``` ::: ::: ::: ```{r} #| context: server or_metrics <- reactive({ or_to_d( a = input$or_a, b = input$or_b, c = input$or_c, d_cell = input$or_d ) }) output$or_heatmap <- renderPlot({ tab <- data.frame( group = rep(c("Group 1", "Group 2"), each = 2), response = rep(c("YES", "NO"), times = 2), n = c(input$or_a, input$or_b, input$or_c, 
input$or_d) ) ggplot(tab, aes(response, group, fill = n)) + geom_tile(color = col_light, linewidth = 1) + geom_text(aes(label = n), color = col_light, fontface = "bold", size = 7) + scale_fill_gradient(low = col_muted, high = col_primary) + labs( title = "2x2 table: raw counts", x = "Response", y = NULL, fill = "N" ) + theme(legend.position = "top") }) output$or_effect_plot <- renderPlot({ m <- or_metrics() df <- data.frame( est = m$d, low = m$ci_low, high = m$ci_high, typ = "Converted effect" ) ggplot(df, aes(est, typ)) + geom_vline(xintercept = 0, linetype = "dashed", color = col_muted) + geom_errorbarh(aes(xmin = low, xmax = high), height = 0.15, linewidth = 1.2, color = col_primary) + geom_point(size = 3.8, color = col_primary) + labs( title = "Conversion to Cohen's d scale", x = "d", y = NULL ) }) output$or_summary <- renderUI({ m <- or_metrics() shiny::div( class = "metric-box", shiny::HTML(sprintf( "OR: %.3f
log(OR): %.3f
Cohen's d: %.3f
SE(d): %.3f
95%% CI: [%.3f, %.3f]", m$odds_ratio, m$log_or, m$d, m$se_d, m$ci_low, m$ci_high )) ) }) ``` ## What if data are missing? Articles do not always report all the information needed for basic calculations. In practice, one then relies on a few strategies: algebraic conversions, approximations used under incomplete reporting, and explicit coding of which values were read directly and which were imputed. Typical relationships include: $$ d = t \sqrt{\frac{n_1 + n_2}{n_1 n_2}} $$ $$ d \approx \frac{2t}{\sqrt{N}} $$ Imputation remains a supporting tool, not a repair of the data. Transparency in coding and sensitivity analysis therefore remain essential. ## Heterogeneity: $Q$, $I^2$, and meta-regression After pooling effects, we ask whether there is evidence of heterogeneity in the true effect sizes and what share of the observed dispersion is real. We also ask what consequences that heterogeneity has for the pooled estimate, and whether study-level variables can explain differences among effects. The *Q* statistic together with its *p* value tests whether the observed variability exceeds the level expected under a common effect, whereas $I^2$ describes what proportion of the dispersion is real. Meta-regression then allows us to analyse relations between study characteristics and effect size. At this point, meta-analysis is no longer only a weighted average; it also becomes a way to explain why study results differ. ## Where do statistical artifacts come from? {.divider-slide} ::: {.divider-lead} Before a problem becomes a meta-analytic problem, it usually begins earlier: in the way single-study results are tested, reported, and interpreted. ::: ## Why begin with hypothesis testing? 1. Hypothesis testing is the simplest entry point to the question of whether an observed result could have arisen by chance. 2. Meta-analysis inherits the consequences of the decisions made in individual studies. 3. 
If we misinterpret *p* values, we will also misread the biases present in the literature. 4. It is at the level of the single test that later meta-analytic artifacts arise: result selection, overinterpretation, and *p-hacking*. 5. That is why a critical reading of meta-analyses begins with the logic of testing. ## What does the app show? 1. The app plots the binomial distribution for the chosen null model. 2. It shows how the *p* value changes as you vary the number of trials, the number of successes, and the test direction. 3. It also marks which outcomes are counted into the *p* value. 4. Use it to see why the same data can give different *p* values under one-sided and two-sided tests. ## How does the mechanism work? 1. First we define a chance model in which success occurs with probability $p_0$. 2. Next we compute the probability of every possible outcome. 3. We then sum the appropriate tail or tails of the distribution depending on the test variant. 4. The resulting *p* value is a measure of how compatible the data are with H0. 
## Application: hypothesis testing {.app-slide} ::: {.columns} ::: {.column width="30%"} ::: {.app-controls} ```{r} shiny::sliderInput("bond_n", "Number of trials", min = 8, max = 80, value = 16, step = 1) shiny::sliderInput("bond_k", "Number of successes", min = 0, max = 16, value = 12, step = 1) shiny::sliderInput("bond_p0", "P(success) under H0", min = 0.10, max = 0.90, value = 0.50, step = 0.05) shiny::selectInput( "bond_tail", "Test variant", choices = c("One-sided (>)" = "greater", "Two-sided" = "two.sided"), selected = "greater" ) ``` ::: ::: ::: {.column width="66%"} ::: {.app-results} ```{r} shiny::plotOutput("bond_plot", height = "430px") shiny::uiOutput("bond_summary") ``` ::: ::: ::: ```{r} #| context: server observeEvent(input$bond_n, { shiny::updateSliderInput( session, "bond_k", max = input$bond_n, value = min(input$bond_k, input$bond_n) ) }) output$bond_plot <- renderPlot({ n <- input$bond_n k <- input$bond_k p0 <- input$bond_p0 tail <- input$bond_tail x_vals <- 0:n probs <- dbinom(x_vals, size = n, prob = p0) obs_prob <- dbinom(k, size = n, prob = p0) selected <- if (tail == "greater") { x_vals >= k } else { probs <= obs_prob + 1e-12 } df <- data.frame( successes = x_vals, probability = probs, region = ifelse(selected, "Counted into p-value", "Outside p-value") ) ggplot(df, aes(successes, probability, fill = region)) + geom_col(width = 0.85, color = col_light, linewidth = 0.2) + geom_vline(xintercept = k, linetype = "dashed", linewidth = 1, color = col_dark) + scale_fill_manual(values = c("Counted into p-value" = col_primary, "Outside p-value" = col_muted)) + labs( x = "Number of successes", y = "Probability under H0", fill = NULL, title = "Binomial distribution and the p-value region" ) + theme(legend.position = "top") }) output$bond_summary <- renderUI({ n <- input$bond_n k <- input$bond_k p0 <- input$bond_p0 tail <- input$bond_tail bt <- binom.test(k, n, p = p0, alternative = tail) p_value <- bt$p.value verdict <- if (p_value < 0.05) { "The 
result is not very compatible with H0 (at the 0.05 level)." } else { "The result is sufficiently compatible with H0 (at the 0.05 level)." } shiny::div( class = "metric-box", shiny::HTML(sprintf( "Result: %s successes out of %s trials.
p-value: %s
Interpretation: %s", k, n, p_fmt(p_value), verdict )) ) }) ``` ## Why is *p-hacking* dangerous? 1. When a researcher tries many analytical paths, the probability of obtaining a chance significant result increases. 2. In the paper, we usually see only the final version, not the full history of analytic attempts. 3. As a result, the proportion of false positives increases even though each individual decision may look harmless. 4. When such results enter meta-analysis as independent pieces of evidence, the pooled estimate can become systematically inflated. ## What does the app show? 1. The app simulates studies with no true effect and records the final reported *p* value. 2. You can compare several researcher degrees of freedom: one-sided tests, outlier removal, Wilcoxon tests, and optional stopping. 3. You can also change starting N, extra sampling, and the number of simulations. 4. As more analytic tries are allowed, *p* values pile up just below 0.05. ## How does the simulation work? 1. We generate two groups with no true effect. 2. We then run several result-"rescuing" strategies one after another. 3. We store the final *p* value together with the method that produced it. 4. At the end, we inspect the distribution of outcomes and the proportion treated as significant. ## What do these manipulations mean? 1. **One-sided test** means choosing the direction that yields the smaller *p* value instead of using a two-sided test. 2. **Outlier exclusion** means removing extreme values and recomputing the test on the "cleaned" data. 3. **Wilcoxon test** means switching to a nonparametric procedure in the hope that it will be more favorable. 4. **Optional stopping** means adding participants in batches and checking after each batch whether $p < 0.05$ has already appeared. 5. **Final *p* value** means the best result obtained across many analytic attempts. 
## Application: *p-hacking* {.app-slide}

::: {.columns}
::: {.column width="30%"}
::: {.app-controls}

```{r}
shiny::sliderInput("hack_B", "Number of simulations", min = 300, max = 5000, value = 1200, step = 100)
shiny::sliderInput("hack_n0", "Starting N per group", min = 12, max = 120, value = 40, step = 2)
shiny::sliderInput("hack_extra", "Max extra N (optional stopping)", min = 0, max = 120, value = 40, step = 2)
shiny::sliderInput("hack_trim", "Outlier-removal threshold (SD)", min = 1.2, max = 3.5, value = 2, step = 0.1)
shiny::checkboxGroupInput(
  "hack_methods",
  "Available strategies",
  choices = c(
    "Switch to one-sided test" = "one",
    "Exclude outliers" = "trim",
    "Use Wilcoxon test" = "rank",
    "Optional stopping" = "peek"
  ),
  selected = c("one", "trim", "rank", "peek"),
  inline = TRUE
)
shiny::actionButton("hack_run", "Recompute simulation", class = "btn-accent")
```

:::
:::

::: {.column width="66%"}
::: {.app-results}

```{r}
shiny::plotOutput("hack_hist_all", height = "300px")
shiny::plotOutput("hack_hist_sig", height = "240px")
shiny::uiOutput("hack_metrics")
```

:::
:::
:::

```{r}
#| context: server

# Re-run the p-hacking simulation when the button is clicked.
# ignoreNULL = FALSE also triggers one run at startup, before the first click.
# simulate_p_hacking() is defined in an earlier server chunk of this
# document (not visible here); it returns one row per simulated study with at
# least p_value, method, and significant columns — TODO confirm against the
# helper's definition.
hack_data <- eventReactive(input$hack_run, {
  simulate_p_hacking(
    B = input$hack_B,
    n0 = input$hack_n0,
    max_extra = input$hack_extra,
    trim_sd = input$hack_trim,
    use_one_sided = "one" %in% input$hack_methods,
    use_trim = "trim" %in% input$hack_methods,
    use_rank = "rank" %in% input$hack_methods,
    use_peeking = "peek" %in% input$hack_methods
  )
}, ignoreNULL = FALSE)

# Histogram of every final reported p-value; the dashed line marks the
# conventional 0.05 significance threshold.
output$hack_hist_all <- renderPlot({
  df <- hack_data()
  ggplot(df, aes(p_value)) +
    geom_histogram(bins = 45, fill = col_secondary, color = col_light, alpha = 0.9) +
    geom_vline(xintercept = 0.05, linetype = "dashed", color = col_primary, linewidth = 1.1) +
    coord_cartesian(xlim = c(0, 1)) +
    labs(
      title = "Final p-values (full distribution)",
      x = "p-value",
      y = "Number of simulations"
    )
})

# Histogram restricted to p < 0.05, colored by the strategy that produced the
# final p-value. Falls back to an empty base-graphics plot with a message when
# there are too few significant results for a readable histogram.
output$hack_hist_sig <- renderPlot({
  df <- hack_data() %>% filter(p_value < 0.05)
  if (nrow(df) < 5) {
    plot.new()
    title("Too few results with p < 0.05 for a histogram")
    return(invisible())
  }
  ggplot(df, aes(p_value, fill = method)) +
    geom_histogram(bins = 20, color = col_light, alpha = 0.9) +
    scale_fill_manual(values = c(
      "Classical t test" = col_muted,
      "One-sided test" = col_secondary_soft,
      "Outlier removal" = col_secondary,
      "Wilcoxon test" = col_primary,
      "Sequentially adding N" = col_dark
    )) +
    labs(
      title = "Distribution of p < 0.05 and dominant strategies",
      x = "p-value",
      y = "Number of results"
    ) +
    theme(legend.position = "top")
})

# Summary box: the realized false-positive rate (the simulation has no true
# effect, so every significant result is a false positive) and the strategy
# that most often produced the final result.
# Note: an unused local `alpha <- 0.05` was removed here; the 5% reference in
# the message text is part of the string, not computed.
output$hack_metrics <- renderUI({
  df <- hack_data()
  fp_rate <- mean(df$significant)
  top_method <- df %>% count(method, sort = TRUE) %>% slice(1)
  shiny::div(
    class = "metric-box",
    shiny::HTML(sprintf(
      "False positives: %.1f%% (against a nominal 5%%)
Most often ends with: %s (%.1f%% of cases)",
      100 * fp_rate, top_method$method, 100 * top_method$n / nrow(df)
    ))
  )
})
```

## Why do we look at *p-curve*?

1. The fact that results are statistically significant does not by itself tell us whether they are credible.
2. *p-curve* analyses the shape of the distribution of significant *p* values in the range from $0$ to $0.05$.
3. The tool asks not only how many results are significant, but also where the *p* values cluster.
4. Treat *p-curve* as one diagnostic among several, not as a standalone verdict on the literature.

## What does the app show?

1. The app shows how the *p-curve* changes with effect size and with the share of hacked studies.
2. It also reports the share of significant results below 0.025.
3. Mixed literatures produce intermediate shapes, and those are usually the hardest to interpret.
4. Sample size and optional stopping affect both the number of significant results and the shape of the curve.

## How should we read the plot?

1. A **right-skewed** *p-curve*, with many very small *p* values, suggests evidential value.
2. A **flat** *p-curve* can be consistent with no effect or with very weakly informative data.
3. A **pile-up just below 0.05** is a warning signal of possible analytic artifacts, including *p-hacking*.
## Application: *p-curve* {.app-slide}

::: {.columns}
::: {.column width="30%"}
::: {.app-controls}

```{r}
shiny::sliderInput("curve_B", "Number of studies", min = 300, max = 6000, value = 1400, step = 100)
shiny::sliderInput("curve_n0", "Typical N per group", min = 12, max = 150, value = 40, step = 2)
shiny::sliderInput("curve_eff", "True effect (d)", min = 0, max = 0.9, value = 0.25, step = 0.05)
shiny::sliderInput("curve_var", "Between-study effect variance", min = 0, max = 0.35, value = 0.00, step = 0.01)
shiny::sliderInput("curve_hack", "Share of hacked studies", min = 0, max = 1, value = 0.40, step = 0.05)
shiny::sliderInput("curve_extra", "Number of extra analytic tries", min = 0, max = 80, value = 12, step = 1)
shiny::actionButton("curve_run", "Recompute p-curve", class = "btn-accent")
```

:::
:::

::: {.column width="66%"}
::: {.app-results}

```{r}
shiny::plotOutput("curve_plot", height = "340px")
shiny::plotOutput("curve_source_plot", height = "235px")
shiny::uiOutput("curve_metrics")
```

:::
:::
:::

```{r}
#| context: server

# Simulate a literature that mixes honest and hacked studies; re-run on click.
# ignoreNULL = FALSE gives one run at startup. simulate_pcurve_studies() is
# defined in an earlier server chunk of this document (not visible here);
# downstream code reads p_value, significant, and source columns from its
# result.
curve_data <- eventReactive(input$curve_run, {
  simulate_pcurve_studies(
    B = input$curve_B,
    n0 = input$curve_n0,
    effect = input$curve_eff,
    var_effect = input$curve_var,
    p_hack_prob = input$curve_hack,
    hack_tries = input$curve_extra
  )
}, ignoreNULL = FALSE)

# The p-curve itself: histogram of significant p-values binned in steps of
# 0.005 on [0, 0.05], with a dashed reference line at 0.025. Shows a base
# graphics fallback message when fewer than 20 studies are significant.
output$curve_plot <- renderPlot({
  sig <- curve_data() %>% filter(significant)
  if (nrow(sig) < 20) {
    plot.new()
    title("Too few significant results for a stable p-curve")
    return(invisible())
  }
  ggplot(sig, aes(p_value)) +
    geom_histogram(
      breaks = seq(0, 0.05, by = 0.005),
      fill = col_primary,
      color = col_light
    ) +
    geom_vline(xintercept = 0.025, linetype = "dashed", color = col_dark, linewidth = 1) +
    coord_cartesian(xlim = c(0, 0.05)) +
    labs(
      title = "p-curve: significant results only",
      subtitle = "Dashed line: 0.025 threshold",
      x = "p-value",
      y = "Number of studies"
    )
})

# Stacked-proportion bars: for each study source, the share of significant
# versus non-significant results (position = "fill" normalizes each bar to 1).
output$curve_source_plot <- renderPlot({
  df <- curve_data() %>%
    mutate(sig_band = ifelse(significant, "p < 0.05", "p >= 0.05")) %>%
    count(source, sig_band)
  ggplot(df, aes(source, n, fill = sig_band)) +
    geom_col(position = "fill") +
    scale_y_continuous(labels = function(x) paste0(round(100 * x), "%")) +
    scale_fill_manual(values = c("p < 0.05" = col_primary, "p >= 0.05" = col_muted)) +
    labs(
      title = "Where do the significant results come from?",
      x = NULL,
      y = "Share",
      fill = NULL
    ) +
    theme(legend.position = "top")
})

# Summary box: overall significance rate, share of significant p-values below
# 0.025, and a verbal diagnosis. Under a uniform (null) p-curve that share is
# 0.5, so > 0.58 is read as right skew (evidential value) and < 0.45 as a
# pile-up near 0.05 (possible artifacts); the band in between is ambiguous.
output$curve_metrics <- renderUI({
  df <- curve_data()
  sig <- df %>% filter(significant)
  if (nrow(sig) == 0) {
    return(shiny::div(class = "metric-box", "No significant results - increase N or the effect size."))
  }
  left_share <- mean(sig$p_value < 0.025)
  sig_rate <- mean(df$significant)
  diagnosis <- if (left_share > 0.58) {
    "Strong signal of evidential value (many very small p values)."
  } else if (left_share < 0.45) {
    "Possible analytical artifacts or a weak effect (too many p values close to 0.05)."
  } else {
    "Ambiguous situation - additional diagnostics are needed."
  }
  shiny::div(
    class = "metric-box",
    shiny::HTML(sprintf(
      "Significant results: %.1f%% of all studies
Share with p < 0.025: %.1f%%
Conclusion: %s",
      100 * sig_rate, 100 * left_share, diagnosis
    ))
  )
})
```

## How can the literature distort the picture of an effect? {.divider-slide}

::: {.divider-lead}
Even a correct synthesis can mislead if only a selected slice of all completed studies reaches publication.
:::

## Why does *publication bias* damage meta-analysis?

1. If mainly statistically significant findings are published, the literature stops being a random sample of all completed studies.
2. A meta-analysis based only on that set will usually "see" an effect that is inflated relative to the full data-generating process.
3. Publication selection acts like a filter: studies are most likely to enter the literature when they point in the expected direction and produce a small enough *p* value.
4. Even a properly conducted meta-analysis can then yield estimates that are precise yet systematically wrong.

## What does the app show?

1. The app compares all studies with the subset that would be published under a significance filter.
2. It shows how selection shifts the pooled estimate and how *trim-and-fill* responds.
3. It also shows how the funnel plot changes as selection pressure and heterogeneity increase.
4. This helps you see when the correction is informative and when it is too weak to recover the full bias.

## How does the simulation work?

1. We draw many studies under a controlled effect size and heterogeneity.
2. We then test each study and publish it only when $p < \alpha$.
3. At the end, we compare the pooled effect for the full set, the published subset, and the *trim-and-fill* correction.
## Application: *publication bias* {.app-slide}

::: {.columns}
::: {.column width="34%"}
::: {.app-controls}

```{r}
shiny::sliderInput("pub_B", "Number of studies", min = 80, max = 1200, value = 320, step = 20)
shiny::sliderInput("pub_n0", "Baseline N per group", min = 12, max = 180, value = 50, step = 2)
shiny::sliderInput("pub_eff", "True effect (d)", min = 0, max = 1.0, value = 0.25, step = 0.05)
shiny::sliderInput("pub_var_eff", "Between-study effect variance", min = 0, max = 0.50, value = 0.10, step = 0.01)
shiny::sliderInput("pub_var_n", "Sample-size variance", min = 0, max = 500, value = 150, step = 10)
shiny::sliderInput("pub_alpha", "Publication threshold", min = 0.01, max = 0.10, value = 0.05, step = 0.005)
shiny::actionButton("pub_run", "Simulate the literature", class = "btn-accent")
```

:::
:::

::: {.column width="66%"}
::: {.app-results}

```{r}
shiny::plotOutput("pub_funnel", height = "335px")
shiny::plotOutput("pub_density", height = "235px")
shiny::uiOutput("pub_metrics")
```

:::
:::
:::

```{r}
#| context: server

# Simulate a full literature (published and unpublished studies) under a
# significance-based publication filter; re-run on click, once at startup.
# simulate_publication_bias() is defined in an earlier server chunk (not
# visible here); downstream code reads yi, vi, and published from its result.
pub_data <- eventReactive(input$pub_run, {
  simulate_publication_bias(
    B = input$pub_B,
    n0 = input$pub_n0,
    effect = input$pub_eff,
    var_effect = input$pub_var_eff,
    var_n = input$pub_var_n,
    alpha = input$pub_alpha
  )
}, ignoreNULL = FALSE)

# Side-by-side base-graphics funnel plots: the full simulated literature
# (published points highlighted) versus the published subset only, on a shared
# x-axis so the selection-induced shift is visible.
output$pub_funnel <- renderPlot({
  df <- pub_data()
  published <- df %>% filter(published)
  if (nrow(df) < 5 || nrow(published) < 5) {
    plot.new()
    title("Too little data for a readable funnel plot")
    return(invisible())
  }
  # try() guards: a degenerate draw can make REML estimation fail; show a
  # message instead of crashing the app.
  fit_all <- try(metafor::rma(yi, vi, data = df, method = "REML"), silent = TRUE)
  fit_pub <- try(metafor::rma(yi, vi, data = published, method = "REML"), silent = TRUE)
  if (inherits(fit_all, "try-error") || inherits(fit_pub, "try-error")) {
    plot.new()
    title("Could not estimate the model for the funnel plot")
    return(invisible())
  }
  # Shared, padded x-range so both panels use the same scale.
  xlim_common <- range(c(df$yi, published$yi), na.rm = TRUE)
  x_pad <- max(0.15, diff(xlim_common) * 0.12)
  xlim_common <- c(xlim_common[1] - x_pad, xlim_common[2] + x_pad)
  # Save and restore graphical parameters (renderPlot shares the device).
  op <- par(no.readonly = TRUE)
  on.exit(par(op), add = TRUE)
  par(
    mfrow = c(1, 2),
    oma = c(0, 0, 0.8, 0),
    mar = c(3.8, 3.8, 2.3, 0.8),
    mgp = c(2.0, 0.6, 0),
    cex.axis = 0.78,
    cex.lab = 0.86,
    cex.main = 0.9
  )
  metafor::funnel(
    fit_all,
    yaxis = "sei",
    xlim = xlim_common,
    xlab = "Estimated effect (SMD)",
    ylab = "Standard error",
    main = "All studies",
    pch = 16,
    col = ifelse(df$published, col_primary, col_secondary),
    level = 95,
    addtau2 = FALSE,
    shade = c(col_light, col_muted),
    back = col_light
  )
  legend(
    "topright",
    legend = c("Unpublished", "Published"),
    pch = 16,
    col = c(col_secondary, col_primary),
    bty = "n",
    cex = 0.66
  )
  metafor::funnel(
    fit_pub,
    yaxis = "sei",
    xlim = xlim_common,
    xlab = "Estimated effect (SMD)",
    ylab = "",
    main = "Published only",
    pch = 16,
    col = col_primary,
    level = 95,
    addtau2 = FALSE,
    shade = c(col_light, col_muted),
    back = col_light
  )
  mtext("Funnel plot: full picture versus published literature", outer = TRUE, cex = 0.92, font = 2)
})

# Overlaid density plots of the estimated effects, split by publication status.
output$pub_density <- renderPlot({
  df <- pub_data()
  ggplot(df, aes(yi, fill = published)) +
    geom_density(alpha = 0.45) +
    scale_fill_manual(values = c("TRUE" = col_primary, "FALSE" = col_secondary)) +
    labs(
      title = "Effect distribution: published versus unpublished",
      x = "SMD",
      y = "Density",
      fill = "Published"
    ) +
    theme(legend.position = "top")
})

# Summary box: publication rate, pooled estimates (all vs published), the
# relative inflation of the published estimate, and the trim-and-fill
# adjustment when it is available.
output$pub_metrics <- renderUI({
  df <- pub_data()
  published <- df %>% filter(published)
  if (nrow(df) < 5 || nrow(published) < 5) {
    return(shiny::div(class = "metric-box", "Too few published studies for a stable meta-analysis."))
  }
  # Fix: the fits were previously unguarded here, unlike every other output in
  # this app — a non-convergent draw would crash the UI. method = "REML" is
  # now explicit for consistency with the sibling fits (it is also rma's
  # default, so the estimates are unchanged).
  fit_all <- try(metafor::rma(yi, vi, data = df, method = "REML"), silent = TRUE)
  fit_pub <- try(metafor::rma(yi, vi, data = published, method = "REML"), silent = TRUE)
  if (inherits(fit_all, "try-error") || inherits(fit_pub, "try-error")) {
    return(shiny::div(class = "metric-box", "Could not estimate the meta-analytic models."))
  }
  tf_text <- "trim-and-fill unavailable (too little data)"
  tf_est <- NA_real_
  tf_fit <- try(metafor::trimfill(fit_pub), silent = TRUE)
  if (!inherits(tf_fit, "try-error")) {
    tf_est <- as.numeric(tf_fit$b[1, 1])
    tf_text <- sprintf("%.3f", tf_est)
  }
  est_all <- as.numeric(fit_all$b[1, 1])
  est_pub <- as.numeric(fit_pub$b[1, 1])
  # Relative inflation of the published estimate. Scalar if/else (not
  # ifelse()) for the denominator, which is clamped to 1 when the pooled
  # effect is essentially zero to avoid dividing by ~0.
  denom <- if (abs(est_all) < 1e-6) 1 else abs(est_all)
  inflation <- 100 * (est_pub - est_all) / denom
  shiny::div(
    class = "metric-box",
    shiny::HTML(sprintf(
      "Published: %.1f%% of studies
Pooled effect (all): %.3f
Pooled effect (published): %.3f
Effect inflation: %.1f%%
Trim-and-fill: %s",
      100 * mean(df$published), est_all, est_pub, inflation, tf_text
    ))
  )
})
```

## How should we read *trim-and-fill*?

1. The method tries to estimate missing studies on the "trimmed" side of the funnel.
2. It produces an adjusted estimate, but it does not fix every form of bias.
3. Read it by comparing the published-only estimate with the adjusted estimate.

## Application: *trim-and-fill* {.app-slide}

::: {.columns}
::: {.column width="34%"}
::: {.app-controls}

```{r}
shiny::sliderInput("tf_B", "Number of studies", min = 80, max = 1200, value = 320, step = 20)
shiny::sliderInput("tf_n0", "Baseline N per group", min = 12, max = 180, value = 50, step = 2)
shiny::sliderInput("tf_eff", "True effect (d)", min = 0, max = 1.0, value = 0.25, step = 0.05)
shiny::sliderInput("tf_var_eff", "Between-study effect variance", min = 0, max = 0.50, value = 0.10, step = 0.01)
shiny::sliderInput("tf_var_n", "Sample-size variance", min = 0, max = 500, value = 150, step = 10)
shiny::sliderInput("tf_alpha", "Publication threshold", min = 0.01, max = 0.10, value = 0.05, step = 0.005)
shiny::checkboxInput("tf_show_unpub", "Show unpublished studies (control)", value = FALSE)
shiny::actionButton("tf_run", "Run trim-and-fill", class = "btn-accent")
```

:::
:::

::: {.column width="66%"}
::: {.app-results}

```{r}
shiny::plotOutput("tf_funnel_plot", height = "320px")
shiny::plotOutput("tf_compare_plot", height = "210px")
shiny::uiOutput("tf_metrics")
```

:::
:::
:::

```{r}
#| context: server

# Same simulator as the publication-bias app, driven by this slide's inputs.
tf_data <- eventReactive(input$tf_run, {
  simulate_publication_bias(
    B = input$tf_B,
    n0 = input$tf_n0,
    effect = input$tf_eff,
    var_effect = input$tf_var_eff,
    var_n = input$tf_var_n,
    alpha = input$tf_alpha
  )
}, ignoreNULL = FALSE)

# ggplot funnel of the published studies plus the studies imputed by
# trim-and-fill, with an optional overlay of the truly unpublished studies so
# the imputation can be compared against the ground truth.
output$tf_funnel_plot <- renderPlot({
  df <- tf_data()
  published <- df %>% filter(published)
  unpublished <- df %>% filter(!published)
  if (nrow(published) < 10) {
    plot.new()
    title("Too few published studies for trim-and-fill")
    return(invisible())
  }
  # Fix: check the base fit for failure BEFORE handing it to trimfill();
  # previously trimfill() was called on a possible try-error object and the
  # failure was only caught indirectly. Same user-facing message either way.
  fit_pub <- try(metafor::rma(yi, vi, data = published, method = "REML"), silent = TRUE)
  if (inherits(fit_pub, "try-error")) {
    plot.new()
    title("Trim-and-fill unavailable for the current parameters")
    return(invisible())
  }
  tf_fit <- try(metafor::trimfill(fit_pub), silent = TRUE)
  if (inherits(tf_fit, "try-error")) {
    plot.new()
    title("Trim-and-fill unavailable for the current parameters")
    return(invisible())
  }
  observed_df <- published %>%
    transmute(
      yi = yi,
      sei = sqrt(vi),
      point_type = "Published"
    )
  # tf_fit$fill flags the imputed ("filled") studies inside the trimfill fit.
  filled_df <- data.frame(
    yi = tf_fit$yi[tf_fit$fill],
    sei = sqrt(tf_fit$vi[tf_fit$fill]),
    point_type = "Filled"
  )
  plot_df <- bind_rows(observed_df, filled_df)
  if (isTRUE(input$tf_show_unpub) && nrow(unpublished) > 0) {
    plot_df <- bind_rows(
      plot_df,
      unpublished %>%
        transmute(
          yi = yi,
          sei = sqrt(vi),
          point_type = "Unpublished (true)"
        )
    )
  }
  plot_df$point_type <- factor(
    plot_df$point_type,
    levels = c("Published", "Filled", "Unpublished (true)")
  )
  est_pub <- as.numeric(fit_pub$b[1, 1])
  est_tf <- as.numeric(tf_fit$b[1, 1])
  # Keep the axis scale fixed regardless of the view (with or without unpublished studies)
  x_ref <- c(df$yi, tf_fit$yi)
  y_ref <- c(sqrt(df$vi), sqrt(tf_fit$vi))
  x_span <- diff(range(x_ref, na.rm = TRUE))
  x_pad <- max(0.12, 0.12 * x_span)
  x_limits <- c(min(x_ref, na.rm = TRUE) - x_pad, max(x_ref, na.rm = TRUE) + x_pad)
  y_limits <- c(max(y_ref, na.rm = TRUE), min(y_ref, na.rm = TRUE))
  # Pseudo-confidence "cone" around the trim-and-fill estimate, traced over
  # the observed range of standard errors.
  se_seq <- seq(min(y_ref, na.rm = TRUE), max(y_ref, na.rm = TRUE), length.out = 120)
  cone <- data.frame(
    sei = se_seq,
    left = est_tf - 1.96 * se_seq,
    right = est_tf + 1.96 * se_seq
  )
  subtitle_text <- if (isTRUE(input$tf_show_unpub)) {
    "Control view: compare filled points with the truly unpublished studies"
  } else {
    "Turn on the control option to see the truly unpublished studies"
  }
  ggplot(plot_df, aes(yi, sei)) +
    geom_line(data = cone, aes(x = left, y = sei), inherit.aes = FALSE, color = col_muted, linewidth = 0.9) +
    geom_line(data = cone, aes(x = right, y = sei), inherit.aes = FALSE, color = col_muted, linewidth = 0.9) +
    geom_vline(xintercept = est_pub, linetype = "dashed", color = col_primary, linewidth = 1) +
    geom_vline(xintercept = est_tf, color = col_dark, linewidth = 1) +
    geom_point(aes(color = point_type, shape = point_type), size = 2.6, alpha = 0.9) +
    scale_color_manual(
      values = c(
        "Published" = col_primary,
        "Filled" = col_secondary,
        "Unpublished (true)" = col_muted
      )
    ) +
    scale_shape_manual(
      values = c(
        "Published" = 16,
        "Filled" = 1,
        "Unpublished (true)" = 17
      )
    ) +
    scale_y_reverse(limits = y_limits) +
    coord_cartesian(xlim = x_limits) +
    labs(
      title = "Funnel after the trim-and-fill correction",
      subtitle = subtitle_text,
      x = "Estimated effect (SMD)",
      y = "Standard error",
      color = NULL,
      shape = NULL
    ) +
    theme(legend.position = "top")
})

# Forest-style comparison of three pooled estimates with 95% CIs:
# the full simulated set, the published subset, and trim-and-fill.
output$tf_compare_plot <- renderPlot({
  df <- tf_data()
  published <- df %>% filter(published)
  if (nrow(df) < 5 || nrow(published) < 10) {
    plot.new()
    title("Too little data to compare estimates")
    return(invisible())
  }
  fit_all <- try(metafor::rma(yi, vi, data = df, method = "REML"), silent = TRUE)
  fit_pub <- try(metafor::rma(yi, vi, data = published, method = "REML"), silent = TRUE)
  tf_fit <- try(metafor::trimfill(fit_pub), silent = TRUE)
  if (inherits(fit_all, "try-error") || inherits(fit_pub, "try-error") || inherits(tf_fit, "try-error")) {
    plot.new()
    title("Could not estimate the comparison models")
    return(invisible())
  }
  sum_df <- data.frame(
    model = c("All studies", "Published only", "After trim-and-fill"),
    est = c(as.numeric(fit_all$b[1, 1]), as.numeric(fit_pub$b[1, 1]), as.numeric(tf_fit$b[1, 1])),
    se = c(fit_all$se, fit_pub$se, tf_fit$se)
  ) %>%
    mutate(
      low = est - 1.96 * se,
      high = est + 1.96 * se
    )
  ggplot(sum_df, aes(est, model, color = model)) +
    geom_vline(xintercept = 0, linetype = "dashed", color = col_muted) +
    geom_errorbarh(aes(xmin = low, xmax = high), height = 0.16, linewidth = 1.4) +
    geom_point(size = 3.6) +
    scale_color_manual(values = c(
      "All studies" = col_secondary,
      "Published only" = col_primary,
      "After trim-and-fill" = col_dark
    )) +
    labs(
      title = "Estimate comparison: full set versus selection versus correction",
      x = "Effect size",
      y = NULL,
      color = NULL
    ) +
    theme(legend.position = "none")
})

# Summary box: publication rate, number of imputed studies (k0), the three
# pooled estimates, and how much of the selection-induced inflation the
# correction removed.
output$tf_metrics <- renderUI({
  df <- tf_data()
  published <- df %>% filter(published)
  if (nrow(df) < 5 || nrow(published) < 10) {
    return(shiny::div(class = "metric-box", "Too little data for stable trim-and-fill."))
  }
  fit_all <- try(metafor::rma(yi, vi, data = df, method = "REML"), silent = TRUE)
  fit_pub <- try(metafor::rma(yi, vi, data = published, method = "REML"), silent = TRUE)
  tf_fit <- try(metafor::trimfill(fit_pub), silent = TRUE)
  if (inherits(fit_all, "try-error") || inherits(fit_pub, "try-error") || inherits(tf_fit, "try-error")) {
    return(shiny::div(class = "metric-box", "Trim-and-fill unavailable for the current parameters."))
  }
  est_all <- as.numeric(fit_all$b[1, 1])
  est_pub <- as.numeric(fit_pub$b[1, 1])
  est_tf <- as.numeric(tf_fit$b[1, 1])
  inflation_before <- est_pub - est_all
  inflation_after <- est_tf - est_all
  # Share of the inflation removed; undefined when there was no inflation to
  # begin with.
  reduction <- if (abs(inflation_before) < 1e-8) {
    NA_real_
  } else {
    100 * (1 - inflation_after / inflation_before)
  }
  # Scalar if/else instead of ifelse(): the condition is a single value.
  reduction_text <- if (is.na(reduction)) "undefined" else sprintf("%.1f%%", reduction)
  k0_val <- as.integer(round(tf_fit$k0))
  shiny::div(
    class = "metric-box",
    shiny::HTML(sprintf(
      "Published: %.1f%% of studies
Filled studies (k0): %d
Pooled effect (all): %.3f
Pooled effect (published): %.3f
Pooled effect (trim-and-fill): %.3f
Reduction in inflation after correction: %s",
      100 * mean(df$published), k0_val, est_all, est_pub, est_tf, reduction_text
    ))
  )
})
```

## What should you take away from this lecture?

1. Meta-analysis can summarize a literature well, but it cannot rescue a weak one.
2. Before trusting the result, check study selection, effect coding, and uncertainty estimation.
3. The distribution of *p* values can sometimes reveal evidential value, selection, or *p-hacking*.
4. *Publication bias* and other selection processes can distort the pooled estimate.
5. A meta-analysis should sharpen the discussion, not end it.

## Questions? {.cat-slide}

```{r}
#| fig-width: 10.6
#| fig-height: 6.2
#| out-width: 100%
# Decorative closing slide: a cat face drawn entirely with ggplot2 geometry.
# The ears double as miniature funnel plots. Logic is unchanged from the
# original; comments only.
library(ggplot2)
ink <- "#000505"
indigo <- "#3B3355"
grape <- "#5D5D81"
sky <- "#BFCDE0"
soft <- "#FEFCFD"
warm <- "#F3F0F7"

# Points on a circle of radius r centered at (cx, cy).
circle_df <- function(cx, cy, r, n = 360) {
  theta <- seq(0, 2 * pi, length.out = n)
  data.frame(
    x = cx + r * cos(theta),
    y = cy + r * sin(theta)
  )
}

# Four corners of a diamond (rotated square) of width w and height h.
diamond_df <- function(cx, cy, w, h) {
  data.frame(
    x = c(cx, cx + w / 2, cx, cx - w / 2),
    y = c(cy + h / 2, cy, cy - h / 2, cy)
  )
}

# Arc from angle `start` to `end` (radians) on a circle of radius r.
arc_df <- function(cx, cy, r, start, end, n = 120) {
  theta <- seq(start, end, length.out = n)
  data.frame(
    x = cx + r * cos(theta),
    y = cy + r * sin(theta)
  )
}

# Rotate the (x, y) columns of df by angle_deg degrees around (cx, cy).
rotate_df <- function(df, angle_deg, cx, cy) {
  angle <- angle_deg * pi / 180
  x_shift <- df$x - cx
  y_shift <- df$y - cy
  df$x <- cx + x_shift * cos(angle) - y_shift * sin(angle)
  df$y <- cy + x_shift * sin(angle) + y_shift * cos(angle)
  df
}

# Rotate a segment data frame (x, y, xend, yend) around (cx, cy).
rotate_segment_df <- function(df, angle_deg, cx, cy) {
  angle <- angle_deg * pi / 180
  x1 <- df$x - cx
  y1 <- df$y - cy
  x2 <- df$xend - cx
  y2 <- df$yend - cy
  df$x <- cx + x1 * cos(angle) - y1 * sin(angle)
  df$y <- cy + x1 * sin(angle) + y1 * cos(angle)
  df$xend <- cx + x2 * cos(angle) - y2 * sin(angle)
  df$yend <- cy + x2 * sin(angle) + y2 * cos(angle)
  df
}

# Five "study" points arranged like a funnel plot inside an ear: the funnel
# narrows toward the ear tip, with alternating horizontal scatter.
funnel_points <- function(center_x) {
  y_vals <- c(1.08, 1.00, 0.92, 0.84, 0.77)
  half_width <- (1.30 - y_vals) / (1.30 - 0.64) * 0.26
  rel_offset <- c(0.00, -0.40, 0.34, -0.28, 0.22)
  data.frame(
    x = center_x + half_width * rel_offset,
    y = y_vals,
    size = c(4.2, 3.9, 3.6, 3.2, 2.9)
  )
}

# Ears: triangles tilted outward by 18 degrees around their centers.
left_ear_center <- c(-0.56, 0.97)
right_ear_center <- c(0.56, 0.97)
left_ear <- rotate_df(
  data.frame(x = c(-0.82, -0.30, -0.56), y = c(0.64, 0.64, 1.30)),
  angle_deg = 18, cx = left_ear_center[1], cy = left_ear_center[2]
)
right_ear <- rotate_df(
  data.frame(x = c(0.30, 0.82, 0.56), y = c(0.64, 0.64, 1.30)),
  angle_deg = -18, cx = right_ear_center[1], cy = right_ear_center[2]
)
left_ear_axis <- rotate_segment_df(
  data.frame(x = -0.56, y = 0.66, xend = -0.56, yend = 1.23),
  angle_deg = 18, cx = left_ear_center[1], cy = left_ear_center[2]
)
right_ear_axis <- rotate_segment_df(
  data.frame(x = 0.56, y = 0.66, xend = 0.56, yend = 1.23),
  angle_deg = -18, cx = right_ear_center[1], cy = right_ear_center[2]
)

# Face geometry.
head_outer <- circle_df(0, 0.03, 1.01)
head_inner <- circle_df(0, -0.02, 0.84)
left_eye <- diamond_df(-0.34, 0.18, 0.28, 0.18)
right_eye <- diamond_df(0.34, 0.18, 0.28, 0.18)
nose <- diamond_df(0, -0.08, 0.16, 0.12)
left_muzzle <- circle_df(-0.16, -0.26, 0.19, n = 180)
right_muzzle <- circle_df(0.16, -0.26, 0.19, n = 180)
left_mouth <- arc_df(-0.10, -0.28, 0.16, 0.14, 1.76)
right_mouth <- arc_df(0.10, -0.28, 0.16, 1.38, 3.00)
chin_arc <- arc_df(0, -0.36, 0.09, 0.24, 2.90)
left_ear_points <- rotate_df(funnel_points(-0.56), 18, left_ear_center[1], left_ear_center[2])
right_ear_points <- rotate_df(funnel_points(0.56), -18, right_ear_center[1], right_ear_center[2])
ear_points <- rbind(left_ear_points, right_ear_points)

# Whiskers drawn as segments; each carries a square "data point" at fraction
# point_t along its length, plus small perpendicular end caps.
whiskers <- data.frame(
  x_start = c(-0.18, -0.12, -0.08, 0.18, 0.12, 0.08),
  y_start = c(-0.02, -0.18, -0.34, -0.02, -0.18, -0.34),
  x_end = c(-1.05, -1.02, -0.96, 1.05, 1.02, 0.96),
  y_end = c(0.08, -0.18, -0.44, 0.08, -0.18, -0.44),
  point_t = c(0.42, 0.48, 0.54, 0.42, 0.48, 0.54),
  point_size = c(4.2, 3.7, 3.2, 4.2, 3.7, 3.2)
)
whiskers$x <- whiskers$x_start + whiskers$point_t * (whiskers$x_end - whiskers$x_start)
whiskers$y <- whiskers$y_start + whiskers$point_t * (whiskers$y_end - whiskers$y_start)
whisker_caps <- do.call(
  rbind,
  lapply(seq_len(nrow(whiskers)), function(i) {
    w <- whiskers[i, ]
    dx <- w$x_end - w$x_start
    dy <- w$y_end - w$y_start
    len <- sqrt(dx^2 + dy^2)
    # Unit normal scaled to the cap half-length.
    nx <- -dy / len * 0.035
    ny <- dx / len * 0.035
    rbind(
      data.frame(
        x = w$x_start - nx, y = w$y_start - ny,
        xend = w$x_start + nx, yend = w$y_start + ny
      ),
      data.frame(
        x = w$x_end - nx, y = w$y_end - ny,
        xend = w$x_end + nx, yend = w$y_end + ny
      )
    )
  })
)

# Assemble the drawing, background to foreground.
ggplot() +
  geom_polygon(
    data = head_outer, aes(x, y),
    fill = warm, color = indigo, linewidth = 1.25
  ) +
  geom_polygon(
    data = head_inner, aes(x, y),
    fill = grDevices::adjustcolor(sky, alpha.f = 0.18), color = NA
  ) +
  geom_polygon(
    data = left_ear, aes(x, y),
    fill = grDevices::adjustcolor(sky, alpha.f = 0.45), color = indigo, linewidth = 1.1
  ) +
  geom_polygon(
    data = right_ear, aes(x, y),
    fill = grDevices::adjustcolor(sky, alpha.f = 0.45), color = indigo, linewidth = 1.1
  ) +
  geom_segment(
    data = rbind(left_ear_axis, right_ear_axis),
    aes(x = x, xend = xend, y = y, yend = yend),
    color = grDevices::adjustcolor(indigo, alpha.f = 0.55),
    linetype = "dashed", linewidth = 0.7
  ) +
  geom_point(
    data = ear_points, aes(x, y, size = size),
    shape = 21, fill = soft, color = indigo, stroke = 0.7
  ) +
  geom_segment(
    data = data.frame(
      x = c(-0.52, 0.16), xend = c(-0.16, 0.52),
      y = c(0.36, 0.36), yend = c(0.31, 0.31)
    ),
    aes(x = x, xend = xend, y = y, yend = yend),
    color = grDevices::adjustcolor(indigo, alpha.f = 0.72), linewidth = 1
  ) +
  geom_segment(
    data = data.frame(
      x = c(-0.52, 0.16), xend = c(-0.16, 0.52),
      y = c(0.28, 0.28), yend = c(0.33, 0.33)
    ),
    aes(x = x, xend = xend, y = y, yend = yend),
    color = grDevices::adjustcolor(indigo, alpha.f = 0.72), linewidth = 1
  ) +
  geom_segment(
    data = data.frame(
      xmin = c(-0.55, 0.13), xmax = c(-0.13, 0.55), y = c(0.18, 0.18)
    ),
    aes(x = xmin, xend = xmax, y = y, yend = y),
    color = indigo, linewidth = 1.1
  ) +
  geom_polygon(
    data = left_eye, aes(x, y),
    fill = grape, color = indigo, linewidth = 0.95
  ) +
  geom_polygon(
    data = right_eye, aes(x, y),
    fill = grape, color = indigo, linewidth = 0.95
  ) +
  geom_point(
    data = data.frame(x = c(-0.34, 0.34), y = c(0.18, 0.18)), aes(x, y),
    shape = 21, size = 3.2, fill = soft, color = indigo, stroke = 0.9
  ) +
  geom_segment(
    data = whiskers,
    aes(x = x_start, xend = x_end, y = y_start, yend = y_end),
    color = indigo, linewidth = 0.95
  ) +
  geom_segment(
    data = whisker_caps,
    aes(x = x, xend = xend, y = y, yend = yend),
    color = indigo, linewidth = 0.85
  ) +
  geom_point(
    data = whiskers, aes(x, y, size = point_size),
    shape = 15, color = grape
  ) +
  geom_polygon(
    data = left_muzzle, aes(x, y),
    fill = grDevices::adjustcolor(soft, alpha.f = 0.72), color = NA
  ) +
  geom_polygon(
    data = right_muzzle, aes(x, y),
    fill = grDevices::adjustcolor(soft, alpha.f = 0.72), color = NA
  ) +
  geom_polygon(
    data = nose, aes(x, y),
    fill = indigo, color = ink, linewidth = 0.9
  ) +
  geom_path(
    data = left_mouth, aes(x, y),
    color = indigo, linewidth = 1
  ) +
  geom_path(
    data = right_mouth, aes(x, y),
    color = indigo, linewidth = 1
  ) +
  annotate(
    "segment", x = 0, xend = 0, y = -0.14, yend = -0.28,
    color = indigo, linewidth = 0.95
  ) +
  geom_path(
    data = chin_arc, aes(x, y),
    color = grDevices::adjustcolor(indigo, alpha.f = 0.72), linewidth = 0.9
  ) +
  coord_equal(
    xlim = c(-1.32, 1.32),
    ylim = c(-1.32, 1.55),
    clip = "off"
  ) +
  scale_size_identity() +
  theme_void() +
  theme(
    plot.background = element_rect(fill = soft, color = NA),
    plot.margin = margin(0, 6, 0, 6)
  )
```