# File size: 25,232 Bytes

"""Aggregate per-seed metrics.json into paper-style result tables (mean±SD).

Scans results/<exp_name>/**/seed*/metrics.json, groups by (dataset, protocol, arch),
reports mean±SD over seeds (over folds for CV datasets). Emits:
  - summary.csv  : full per-(dataset,method) detail, every metric (raw data export)
  - summary.md   : the main Dice table, methods×datasets (quick read)
  - summary.tex  : the main Dice table as booktabs LaTeX (paper-ready)
  - summary.html : full paper-style report (main tables, per-class, significance, setup)

  python framework/report/aggregate.py --exp_name baselines [--out_root results]
"""
from __future__ import annotations

import os
import json
import glob
import argparse
import warnings
from collections import defaultdict

import numpy as np

# Per-image Dice vectors can have all-NaN positions (empty masks across seeds);
# np.nanmean then emits a harmless "Mean of empty slice" warning. Silence it for
# clean console/report runs.
# NOTE: the filter is installed as a side effect of importing this module and is
# matched on the message text only, so it also hides that warning for other code
# running in the same process.
warnings.filterwarnings("ignore", message="Mean of empty slice")

# Metrics aggregated by the report, one tuple per metric:
#   (key, label, is_percent, higher_is_better)
#   key              -- metric name; "<key>_mean" is looked up in each run's "metrics" dict
#   label            -- short display name
#   is_percent       -- True when the value should be displayed as a percentage
#   higher_is_better -- True when larger values are better
# In the code visible here only `key` is consumed from this table (summarize, to_csv);
# the HTML tables pass their own pct/direction flags explicitly.
METRICS = [
    ("dice", "Dice", True, True),
    ("iou", "IoU", True, True),
    ("hd95", "HD95", False, False),
    ("assd", "ASSD", False, False),
    ("sensitivity", "Sens", True, True),
    ("specificity", "Spec", True, True),
    ("precision", "Prec", True, True),
]


def load_runs(out_root, exp_name):
    """Load every per-seed ``metrics.json`` under ``<out_root>/<exp_name>``.

    Files are matched with the recursive pattern
    ``<out_root>/<exp_name>/**/seed*/metrics.json`` and read in sorted path order so
    the result is reproducible across filesystems.

    Loading stays best-effort: a file that cannot be read, is not valid JSON, or does
    not contain a JSON object is skipped. Each skip is reported through
    ``warnings.warn`` rather than being dropped silently.

    Args:
        out_root: Root results directory.
        exp_name: Experiment sub-directory under ``out_root``.

    Returns:
        A list with one dict per successfully loaded metrics file (possibly empty).
    """
    pattern = os.path.join(out_root, exp_name, "**", "seed*", "metrics.json")
    runs = []
    # glob order depends on the filesystem; sort for deterministic aggregation.
    for path in sorted(glob.glob(pattern, recursive=True)):
        try:
            # JSON is UTF-8 by specification; do not depend on the locale encoding.
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            warnings.warn(f"skipping unreadable metrics file {path}: {err}", stacklevel=2)
            continue
        if not isinstance(data, dict):
            # Downstream code calls .get() on every run, so anything else would crash later.
            warnings.warn(f"skipping metrics file {path}: top-level JSON value is not an object",
                          stacklevel=2)
            continue
        runs.append(data)
    return runs


_PROTO_LABEL = {
    ("idridd_segmentation", "fold01"): "official",
    ("busi", "fold01"): "single-split",
    ("medsegdb_kits19", "fold01"): "single-split",
    ("pannuke_semantic", "fold01"): "single-split",
}
_CV_DATASETS = {"pannuke_semantic"}


def _proto_label(dataset, protocol):
    return _PROTO_LABEL.get((dataset, protocol), protocol)


def _agg_over(items, key):
    vals = np.array([it.get("metrics", {}).get(f"{key}_mean", np.nan) for it in items], np.float64)
    vals = vals[~np.isnan(vals)]
    return (float(vals.mean()), float(vals.std())) if vals.size else (float("nan"), float("nan"))


def summarize(runs):
    """Collapse raw runs into one summary row per ``(dataset, arch)``.

    Runs are grouped by ``(dataset, arch)`` and then by ``protocol``. Two cases:

    * Dataset listed in ``_CV_DATASETS`` with more than one named protocol: each
      protocol is first averaged over its runs, then mean/SD are taken over those
      per-protocol means. ``protocol`` becomes ``"<k>-fold"`` and ``n_seeds`` holds
      the number of protocols (folds).
    * Otherwise: only the first protocol in sorted order (or the ``None`` protocol
      when no run names one) is used; mean/SD are taken over its runs and
      ``n_seeds`` is the number of those runs.

    Returns:
        A list of dicts, sorted by ``(dataset, arch)``, with keys ``dataset``,
        ``arch``, ``protocol``, ``n_seeds`` and ``<key>_mean`` / ``<key>_sd`` for
        every entry of ``METRICS``.
    """
    grouped = defaultdict(lambda: defaultdict(list))
    for run in runs:
        pair = (run.get("dataset"), run.get("arch"))
        grouped[pair][run.get("protocol")].append(run)

    metric_keys = [key for key, _, _, _ in METRICS]
    rows = []
    for pair in sorted(grouped):
        dataset, arch = pair
        per_proto = grouped[pair]
        named = sorted(p for p in per_proto if p is not None)
        row = {"dataset": dataset, "arch": arch}

        if dataset in _CV_DATASETS and len(named) > 1:
            # Cross-validation: statistics are taken over the per-fold means.
            row["protocol"] = f"{len(named)}-fold"
            row["n_seeds"] = len(named)
            for key in metric_keys:
                per_fold = np.array([_agg_over(per_proto[p], key)[0] for p in named], np.float64)
                per_fold = per_fold[~np.isnan(per_fold)]
                if per_fold.size:
                    row[f"{key}_mean"] = float(per_fold.mean())
                    row[f"{key}_sd"] = float(per_fold.std())
                else:
                    row[f"{key}_mean"] = float("nan")
                    row[f"{key}_sd"] = float("nan")
        else:
            # Single protocol: statistics are taken over that protocol's runs.
            chosen = named[0] if named else None
            members = per_proto.get(chosen, [])
            row["protocol"] = _proto_label(dataset, chosen)
            row["n_seeds"] = len(members)
            for key in metric_keys:
                mean, sd = _agg_over(members, key)
                row[f"{key}_mean"] = mean
                row[f"{key}_sd"] = sd
        rows.append(row)
    return rows


# ----------------------------------------------------------------------------- display
# Row order of methods in every table (see _grid); also the set of archs considered
# by the significance test and the per-class section.
_ARCH_ORDER = ["unet", "unetpp", "deeplabv3plus", "attention_unet", "transunet", "swinunet",
               "nnunet", "umamba"]
# arch key -> name shown in tables; keys missing here are displayed verbatim.
_ARCH_DISP = {"unet": "UNet", "unetpp": "UNet++", "deeplabv3plus": "DeepLabV3+",
              "attention_unet": "Attention-UNet", "transunet": "TransUNet",
              "swinunet": "Swin-UNet", "nnunet": "nnU-Net", "umamba": "U-Mamba"}
# Column order of datasets in every table; datasets not listed are appended after these.
_DS_ORDER = ["cvc_clinicdb", "kvasir_seg", "fives", "busi", "refuge2", "acdc_png",
             "idridd_segmentation", "pannuke_semantic", "medsegdb_isic2018", "medsegdb_kits19"]
# dataset key -> name shown in tables; keys missing here are displayed verbatim.
_DS_DISP = {"cvc_clinicdb": "CVC-ClinicDB", "kvasir_seg": "Kvasir-SEG", "fives": "FIVES",
            "busi": "BUSI", "refuge2": "REFUGE2", "acdc_png": "ACDC",
            "idridd_segmentation": "IDRiD", "pannuke_semantic": "PanNuke",
            "medsegdb_isic2018": "ISIC2018", "medsegdb_kits19": "KiTS19"}


def _fmt(row, key, pct):
    m, s = row[f"{key}_mean"], row[f"{key}_sd"]
    if m != m:
        return "—"
    return f"{m*100:.2f}±{s*100:.2f}" if pct else f"{m:.2f}±{s:.2f}"


def _grid(rows):
    """Index summary rows for table rendering.

    Returns:
        ``(cell, datasets, methods)`` where ``cell`` maps ``(dataset, arch)`` to its
        row, ``datasets`` lists the datasets present (``_DS_ORDER`` first, then any
        others in order of first appearance) and ``methods`` lists the archs present
        in ``_ARCH_ORDER`` order. Only when none of the archs is in ``_ARCH_ORDER``
        does ``methods`` fall back to all archs, sorted.

    NOTE(review): when at least one known arch is present, archs missing from
    ``_ARCH_ORDER`` are left out of ``methods`` entirely, whereas unknown datasets are
    kept. Confirm this asymmetry is intended.
    """
    lookup = {}
    for r in rows:
        lookup[(r["dataset"], r["arch"])] = r

    archs_present = {r["arch"] for r in rows}
    methods = [a for a in _ARCH_ORDER if a in archs_present]
    if not methods:
        methods = sorted(archs_present)

    datasets_present = {r["dataset"] for r in rows}
    known = [d for d in _DS_ORDER if d in datasets_present]
    unknown = [r["dataset"] for r in rows if r["dataset"] not in _DS_ORDER]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    datasets = list(dict.fromkeys(known + unknown))
    return lookup, datasets, methods


# ----------------------------------------------------------------------------- significance
def _per_image_dice_vec(runs_for_da):
    by_proto = defaultdict(list)
    for d in runs_for_da:
        by_proto[d.get("protocol")].append(d)
    parts = []
    for proto in sorted(by_proto):
        arrs = [np.array([pi.get("dice", np.nan) for pi in d.get("per_image", [])], float)
                for d in by_proto[proto]]
        arrs = [a for a in arrs if a.size]
        if not arrs:
            continue
        L = min(a.size for a in arrs)
        parts.append(np.nanmean(np.stack([a[:L] for a in arrs]), axis=0))
    return np.concatenate(parts) if parts else np.array([])


def _sig_tied_sets(runs):
    """Return ``{dataset: set(archs)}``, the 'statistically best' set per dataset.

    For every dataset the arch with the highest mean per-image Dice is the reference.
    Another arch joins the set unless a paired Wilcoxon test on per-image Dice gives
    p < 0.05 against that reference. The set is used to bold the Dice table.

    Only archs listed in ``_ARCH_ORDER`` are considered. Datasets without any
    per-image data are absent from the result. An empty dict is returned when SciPy
    cannot be imported.
    """
    try:
        from scipy.stats import wilcoxon
    except Exception:
        return {}

    grouped = defaultdict(list)
    for run in runs:
        grouped[(run.get("dataset"), run.get("arch"))].append(run)

    def pval(a, b):
        # Pair by position over the common prefix of the two vectors.
        n = min(a.size, b.size)
        if n < 6:
            return float("nan")
        x = a[:n]
        y = b[:n]
        both = ~(np.isnan(x) | np.isnan(y))
        if both.sum() < 6 or np.allclose(x[both], y[both]):
            return 1.0
        try:
            return float(wilcoxon(x[both], y[both]).pvalue)
        except Exception:
            return 1.0

    tied_by_ds = {}
    for ds in {pair[0] for pair in grouped}:
        vecs = {}
        for a in _ARCH_ORDER:
            if (ds, a) not in grouped:
                continue
            vec = _per_image_dice_vec(grouped[(ds, a)])
            if vec.size:
                vecs[a] = vec
        if not vecs:
            continue
        means = {a: float(np.nanmean(v)) for a, v in vecs.items()}
        best = max(means, key=means.get)
        reference = vecs[best]
        tied = {best}
        for a, v in vecs.items():
            if a == best:
                continue
            # Written as `not (p < 0.05)` on purpose: a NaN p-value (too few pairs)
            # compares False and therefore counts as "tied".
            if not (pval(reference, v) < 0.05):
                tied.add(a)
        tied_by_ds[ds] = tied
    return tied_by_ds


# ----------------------------------------------------------------------------- text exports
def to_csv(rows):
    """Serialize summary rows to CSV text.

    Columns are ``dataset, protocol, arch, n_seeds`` followed by ``<key>_mean`` and
    ``<key>_sd`` for every entry of ``METRICS``. Values are converted with ``str()``
    (so NaN is written as ``nan``) and lines end with ``"\\n"``.

    The text is produced by the ``csv`` module, so a value containing a comma, a
    double quote or a line break is quoted instead of corrupting the row. For
    values without such characters the output is the plain comma-joined line.

    Args:
        rows: Iterable of row dicts as returned by ``summarize``.

    Returns:
        The CSV document as a single string, header line included.

    Raises:
        KeyError: If a row lacks one of the expected columns.
    """
    import csv
    import io

    cols = ["dataset", "protocol", "arch", "n_seeds"]
    for key, _, _, _ in METRICS:
        cols += [f"{key}_mean", f"{key}_sd"]

    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator="\n")
    writer.writerow(cols)
    for r in rows:
        # Explicit str() keeps None as "None" (csv.writer would write an empty field).
        writer.writerow([str(r[c]) for c in cols])
    return buf.getvalue()


def _dice_matrix(rows):
    """Return ``(cell, datasets, methods, avg)`` for the main Dice table.

    ``cell``, ``datasets`` and ``methods`` come straight from ``_grid``. ``avg`` maps
    each method to the NaN-ignoring mean of its ``dice_mean`` over the datasets it
    has a cell for (NaN when it has none).
    """
    cell, datasets, methods = _grid(rows)
    avg = {}
    for a in methods:
        scores = [cell[(d, a)]["dice_mean"] for d in datasets if (d, a) in cell]
        avg[a] = np.nanmean(scores or [np.nan])
    return cell, datasets, methods, avg


def _dice_bold(a, d, cell, best, sig):
    """Whether (dataset d, arch a)'s Dice cell should be bold: in the significance
    'tied-for-best' set when available, else the single best per dataset."""
    if (d, a) not in cell:
        return False
    if sig is not None:
        return a in sig.get(d, set())
    return cell[(d, a)]["dice_mean"] == best[d]


def to_markdown(rows, sig=None):
    """Render the main Dice table (methods × datasets) as Markdown.

    Each cell shows mean±SD in percent; a missing (dataset, method) pair is shown
    as an en dash. Bolding is delegated to ``_dice_bold``.

    Args:
        rows: Summary rows as returned by ``summarize``.
        sig: Optional ``{dataset: set(archs)}`` from ``_sig_tied_sets``.

    Returns:
        The Markdown document as a string ending with a newline.
    """
    cell, datasets, methods, _ = _dice_matrix(rows)

    # Best mean per dataset for the non-significance fallback. NaN means are left
    # out: max() over a sequence containing NaN depends on element order and can
    # return NaN, which would match no cell.
    best = {}
    for d in datasets:
        scores = [cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell]
        best[d] = max((s for s in scores if s == s), default=np.nan)

    head = ["Method"] + [_DS_DISP.get(d, d) for d in datasets]
    lines = [
        "## Main results — Dice (mean±SD %, ↑)",
        "",
        ("_**Bold** = best or not significantly worse than best per dataset "
         "(paired Wilcoxon on per-image Dice, p≥0.05). No cross-dataset average column — "
         "the seven modalities are too heterogeneous for one number to be meaningful._"),
        "",
        "| " + " | ".join(head) + " |",
        "|" + "---|" * len(head),
    ]
    for a in methods:
        cells = [_ARCH_DISP.get(a, a)]
        for d in datasets:
            if (d, a) in cell:
                t = _fmt(cell[(d, a)], "dice", True)
                cells.append(f"**{t}**" if _dice_bold(a, d, cell, best, sig) else t)
            else:
                cells.append("–")
        lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(lines) + "\n"


def to_latex(rows, sig=None):
    """Render the main Dice table as a booktabs ``tabular`` (means only, in percent).

    Compared with a naive rendering this takes care of four things:

    * a NaN mean is printed as ``--`` (like a missing cell) instead of ``nan``;
    * dataset/arch keys that have no display name are LaTeX-escaped, so raw
      identifiers such as ``my_dataset`` do not break compilation (entries of
      ``_DS_DISP`` / ``_ARCH_DISP`` are emitted verbatim);
    * the best-per-dataset fallback ignores NaN means;
    * the rule after ``attention_unet`` is only emitted when more rows follow, so
      it never sits directly on top of ``\\bottomrule``.

    Args:
        rows: Summary rows as returned by ``summarize``.
        sig: Optional ``{dataset: set(archs)}`` from ``_sig_tied_sets``.

    Returns:
        The LaTeX source as a string.
    """
    cell, datasets, methods, _ = _dice_matrix(rows)

    specials = {
        "\\": r"\textbackslash{}", "&": r"\&", "%": r"\%", "$": r"\$", "#": r"\#",
        "_": r"\_", "{": r"\{", "}": r"\}", "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    }

    def tex_escape(text):
        # Character-wise mapping, so already-inserted backslashes are never re-escaped.
        return "".join(specials.get(ch, ch) for ch in str(text))

    def display(table, key):
        return table[key] if key in table else tex_escape(key)

    best = {}
    for d in datasets:
        scores = [cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell]
        best[d] = max((s for s in scores if s == s), default=np.nan)

    spec = "l" + "c" * len(datasets)
    parts = [
        "% Main results: Dice (mean over seeds, %). Bold = best or not significantly\n"
        "% worse than best per dataset (paired Wilcoxon on per-image Dice, p>=0.05).\n"
        "% No cross-dataset average column (modalities too heterogeneous).\n",
        "\\begin{tabular}{" + spec + "}\n\\toprule\n",
        "Method & " + " & ".join(display(_DS_DISP, d) for d in datasets) + " \\\\\n\\midrule\n",
    ]
    for i, a in enumerate(methods):
        cells = [display(_ARCH_DISP, a)]
        for d in datasets:
            mean = cell[(d, a)]["dice_mean"] if (d, a) in cell else float("nan")
            if mean != mean:
                # Missing cell or NaN mean.
                cells.append("--")
                continue
            t = f"{mean * 100:.1f}"
            cells.append(f"\\textbf{{{t}}}" if _dice_bold(a, d, cell, best, sig) else t)
        parts.append(" & ".join(cells) + " \\\\\n")
        if a == "attention_unet" and i < len(methods) - 1:
            parts.append("\\midrule\n")  # separate CNNs from transformers/foundation
    parts.append("\\bottomrule\n\\end{tabular}\n")
    return "".join(parts)


# ----------------------------------------------------------------------------- HTML report
# Static description tables rendered verbatim in the HTML appendix (_setup_html).
# They are hand-maintained and are not derived from the loaded runs.

# One tuple per dataset, matching the appendix columns:
# (#, Dataset, Modality, Target, Cls, Ch, Native size, Protocol, Train/Val/Test).
_DATASETS_INFO = [
    ("1", "CVC-ClinicDB", "Colonoscopy (endoscopy)", "Polyp", "2", "RGB", "384×288", "official", "490 / 61 / 61"),
    ("2", "Kvasir-SEG", "GI endoscopy", "Polyp", "2", "RGB", "~622×529 (var)", "official", "800 / 100 / 100"),
    ("3", "FIVES", "Retinal fundus", "Vessel", "2", "RGB", "2048×2048", "official", "480 / 120 / 200"),
    ("4", "BUSI", "Breast ultrasound", "Tumor", "2", "grayscale¹", "variable", "single-split²", "545 / 78 / 157"),
    ("5", "REFUGE2", "Retinal fundus", "Optic disc & cup", "3", "RGB", "~2124×2056", "official", "400 / 400 / 400"),
    ("6", "ACDC", "Cardiac MRI (2D slices)", "RV / Myo / LV", "4", "grayscale", "~240×256 (var)", "official", "136 / 210 / 380"),
    ("7", "IDRiD", "Retinal fundus", "DR lesions (4) + optic disc", "6", "RGB", "4288×2848", "official", "43 / 11 / 27"),
    ("8", "PanNuke", "Histopathology (H&E)", "Nuclei (5 types)", "6", "RGB", "256×256", "official 3-fold CV", "~2.7k / 2.6k / 2.6k per fold"),
    ("9", "ISIC2018", "Dermoscopy", "Skin lesion", "2", "RGB", "256×256", "holdout", "2582 / 369 / 737"),
    ("10", "KiTS19", "Kidney CT (2D slices)", "Kidney (binary)", "2", "grayscale¹", "256×256", "single-split²", "2832 / 479 / 705"),
]
# One tuple per method: (Method, Family, Backbone / setup).
_METHODS_INFO = [
    ("UNet", "CNN encoder–decoder", "SMP, ResNet-50 encoder (ImageNet)"),
    ("UNet++", "Nested UNet", "SMP, ResNet-50 (ImageNet)"),
    ("DeepLabV3+", "Atrous CNN", "SMP, ResNet-50 (ImageNet)"),
    ("Attention-UNet", "Attention-gated UNet", "Re-implemented, from scratch"),
    ("TransUNet", "CNN–Transformer hybrid", "R50-ViT-B/16 (ImageNet), input 256"),
    ("Swin-UNet", "Pure-Transformer UNet", "Swin-Tiny (ImageNet), input 224"),
    ("nnU-Net (v2)", "Self-configuring CNN", "2D config, 250 epochs"),
    ("U-Mamba", "State-space (Mamba) UNet", "U-Mamba_Bot, 100 epochs"),
]
# One tuple per metric: (Metric, Definition, Dir, Unit, description). The description
# column is Chinese text and is emitted as-is into the report.
_METRICS_INFO = [
    ("Dice (DSC)", "2TP / (2TP+FP+FN)", "↑", "%", "区域重叠度(主指标),对类别不平衡较鲁棒。"),
    ("IoU (Jaccard)", "TP / (TP+FP+FN)", "↑", "%", "交并比,更严格的重叠度,常与 Dice 并列。"),
    ("HD95", "95% Hausdorff distance (boundaries)", "↓", "px", "边界最大误差的95%分位,越小边界越贴合。"),
    ("ASSD", "average symmetric surface distance", "↓", "px", "平均对称表面距离,整体边界吻合度。"),
    ("Sensitivity", "TP / (TP+FN)", "↑", "%", "召回/敏感度,反映漏分割程度。"),
    ("Specificity", "TN / (TN+FP)", "↑", "%", "特异度,背景误报控制。"),
    ("Precision", "TP / (TP+FP)", "↑", "%", "精确率,反映过分割/误报程度。"),
]
# dataset key -> {class id (as a string) -> display name} for the per-class Dice
# section; only datasets listed here get a per-class table, and only these class
# ids become columns (ordered by their integer value).
_PERCLASS_NAMES = {
    "acdc_png": {"1": "RV", "2": "Myocardium", "3": "LV"},
    "refuge2": {"1": "Optic Disc", "2": "Optic Cup"},
    "idridd_segmentation": {"1": "MA", "2": "Haemorrhage", "3": "Hard Exudate", "4": "Soft Exudate", "5": "Optic Disc"},
    "pannuke_semantic": {"1": "Neoplastic", "2": "Inflammatory", "3": "Connective", "4": "Dead", "5": "Epithelial"},
}


def _collect_perclass(runs):
    acc = defaultdict(lambda: defaultdict(list))
    for d in runs:
        key = (d.get("dataset"), d.get("arch"))
        for pi in d.get("per_image", []):
            for c, m in (pi.get("per_class") or {}).items():
                v = (m or {}).get("dice")
                if v is not None and v == v:
                    acc[key][c].append(v)
    return {k: {c: float(np.mean(v)) for c, v in cd.items() if v} for k, cd in acc.items()}


# Stylesheet inlined into the <style> element of the HTML report by to_html.
# Classes used by the generated markup: table.rt (result tables), table.info
# (appendix tables), .cap (captions), .note (highlighted note), .tw (scroll wrapper).
_CSS = """
body{font-family:'Helvetica Neue',Arial,sans-serif;margin:30px auto;max-width:1180px;color:#1a1a1a;line-height:1.5}
h1{font-size:21px;margin:0 0 4px}h2{font-size:15px;color:#0a5a33;margin:30px 0 4px;border-bottom:1px solid #e3e3e3;padding-bottom:3px}
h3{font-size:13px;margin:16px 0 4px;color:#333}
p,li{font-size:13px}code{background:#f2f2f2;padding:1px 4px;border-radius:3px}
.cap{color:#666;font-size:11.5px;margin:3px 0 6px}
.tw{overflow-x:auto}
table.rt{border-collapse:collapse;margin:6px 0 8px;font-size:11.5px}
table.rt th,table.rt td{padding:4px 9px;text-align:center;white-space:nowrap}
table.rt thead th{border-top:2px solid #222;border-bottom:1.2px solid #222;font-weight:600}
table.rt tbody tr:last-child td{border-bottom:2px solid #222}
table.rt td.m,table.rt th.m{text-align:left;font-weight:600}
table.rt td.avg,table.rt th.avg{border-left:1px solid #c8c8c8;background:#f7f9f8}
table.rt tbody tr.grp td{border-top:1px solid #cfcfcf}
table.rt b{color:#08402a}
table.info{border-collapse:collapse;margin:6px 0 14px;font-size:12px}
table.info th,table.info td{border:1px solid #ddd;padding:4px 8px;text-align:center}
table.info th{background:#f3f3f3}table.info td.l{text-align:left}
.note{background:#eef7f0;border-left:3px solid #0a6;padding:8px 12px;font-size:12.5px;margin:8px 0}
hr{border:none;border-top:1px solid #e3e3e3;margin:24px 0}
"""


def _metric_table(cell, datasets, methods, key, pct, hib, bold_sets=None):
    """Render one metric as an HTML table: methods (rows) × datasets (columns).

    Bolding: when ``bold_sets`` has an entry for a dataset (Dice significance), the
    archs in that set are bold. Otherwise the single best value of the column is
    bold. That covers ``bold_sets`` being ``None``, being empty (significance
    testing unavailable), or lacking that dataset.

    Deliberately NO cross-dataset summary column: the ten datasets span seven
    modalities with very different difficulty, so a simple average is not
    meaningful (and would conflict with the per-dataset ranking).

    Args:
        cell: Mapping ``(dataset, arch) -> summary row``.
        datasets: Column order.
        methods: Row order.
        key: Metric key; ``<key>_mean`` / ``<key>_sd`` are read from each row.
        pct: Whether values are displayed as percentages.
        hib: True when higher is better (selects max vs. min for the column best).
        bold_sets: Optional ``{dataset: set(archs)}`` overriding the column best.

    Returns:
        The table as an HTML string.
    """
    mean_key = f"{key}_mean"

    def has_value(d, a):
        # Present and not NaN (NaN is the only value unequal to itself).
        return (d, a) in cell and cell[(d, a)][mean_key] == cell[(d, a)][mean_key]

    pick = max if hib else min
    best = {}
    for d in datasets:
        vals = {a: cell[(d, a)][mean_key] for a in methods if has_value(d, a)}
        best[d] = pick(vals, key=vals.get) if vals else None

    h = ["<div class='tw'><table class='rt'><thead><tr><th class='m'>Method</th>"
         + "".join(f"<th>{_DS_DISP.get(d, d)}</th>" for d in datasets)
         + "</tr></thead><tbody>"]
    for a in methods:
        # The 'grp' row gets a top border separating it from the rows above.
        grp = " class='grp'" if a == "transunet" else ""
        tds = [f"<td class='m'>{_ARCH_DISP.get(a, a)}</td>"]
        for d in datasets:
            if not has_value(d, a):
                tds.append("<td>–</td>")
                continue
            t = _fmt(cell[(d, a)], key, pct)
            if bold_sets is not None and d in bold_sets:
                bold = a in bold_sets[d]
            else:
                # No significance set for this dataset: fall back to the column best.
                bold = a == best[d]
            tds.append(f"<td>{'<b>'+t+'</b>' if bold else t}</td>")
        h.append(f"<tr{grp}>" + "".join(tds) + "</tr>")
    h.append("</tbody></table></div>")
    return "\n".join(h)


def _perclass_section(runs):
    """Render the per-class Dice tables (one per dataset in ``_PERCLASS_NAMES``).

    Rows are the archs of ``_ARCH_ORDER`` that have per-class data for the dataset;
    columns are the named classes plus a ``macro`` column, which is the unweighted
    mean of the classes that arch actually has. The best value of each class column
    is bold. Datasets without any per-class data are skipped.

    The column best is computed only over archs that have the class. Mixing in a
    NaN placeholder for missing entries would make ``max()`` order-dependent and
    could leave a column without any bold cell. A row with none of the named
    classes shows an en dash in the macro column instead of ``nan``.

    Args:
        runs: Raw run dicts as returned by ``load_runs``.

    Returns:
        The HTML fragment, or an empty string when no dataset has per-class data.
    """
    pc = _collect_perclass(runs)
    h = []
    for ds, names in _PERCLASS_NAMES.items():
        methods = [a for a in _ARCH_ORDER if pc.get((ds, a))]
        if not methods:
            continue
        classes = sorted(names, key=int)

        colbest = {}
        for c in classes:
            scores = [pc[(ds, a)][c] for a in methods if c in pc[(ds, a)]]
            colbest[c] = max(scores) if scores else None

        h.append(f"<h3>{_DS_DISP.get(ds, ds)}</h3>")
        h.append("<div class='tw'><table class='rt'><thead><tr><th class='m'>Method</th>"
                 + "".join(f"<th>{names[c]}</th>" for c in classes) + "<th class='avg'>macro</th></tr></thead><tbody>")
        for a in methods:
            grp = " class='grp'" if a == "transunet" else ""
            cells, present = [], []
            for c in classes:
                v = pc[(ds, a)].get(c)
                if v is None:
                    cells.append("<td>–</td>")
                    continue
                present.append(v)
                t = f"{v*100:.1f}"
                cells.append(f"<td>{'<b>'+t+'</b>' if v == colbest[c] else t}</td>")
            macro = f"{sum(present) / len(present) * 100:.1f}" if present else "–"
            h.append(f"<tr{grp}><td class='m'>{_ARCH_DISP.get(a, a)}</td>{''.join(cells)}"
                     f"<td class='avg'>{macro}</td></tr>")
        h.append("</tbody></table></div>")
    return "\n".join(h)


def _setup_html():
    """Render the appendix: the static Datasets, Methods and Metrics tables.

    The content comes from ``_DATASETS_INFO``, ``_METHODS_INFO`` and
    ``_METRICS_INFO``; the fragments are joined with newlines.
    """
    dataset_row = ("<tr><td>%s</td><td class='l'>%s</td><td class='l'>%s</td><td class='l'>%s</td><td>%s</td>"
                   "<td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>")
    method_row = "<tr><td class='l'>%s</td><td class='l'>%s</td><td class='l'>%s</td></tr>"
    metric_row = "<tr><td class='l'>%s</td><td class='l'>%s</td><td>%s</td><td>%s</td><td class='l'>%s</td></tr>"

    parts = []

    # A. Datasets
    parts.append("<h2>A. Datasets</h2>")
    parts.append("<table class='info'><tr><th>#</th><th>Dataset</th><th>Modality</th><th>Target</th><th>Cls</th>"
                 "<th>Ch</th><th>Native size</th><th>Protocol</th><th>Train/Val/Test</th></tr>")
    parts.extend(dataset_row % info for info in _DATASETS_INFO)
    parts.append("</table>")
    parts.append("<div class='cap'>¹ BUSI/KiTS19 grayscale stored as 3-ch PNG (read as grayscale). "
                 "² no canonical split → one fixed fold (of 5) with 3 seeds; others use the official split. "
                 "Labels 0…C-1 (0=bg); multi-class metrics macro-averaged over foreground classes.</div>")

    # B. Methods
    parts.append("<h2>B. Methods</h2>")
    parts.append("<table class='info'><tr><th>Method</th><th>Family</th><th>Backbone / setup</th></tr>")
    parts.extend(method_row % info for info in _METHODS_INFO)
    parts.append("</table>")

    # C. Metrics
    parts.append("<h2>C. Metrics</h2>")
    parts.append("<table class='info'><tr><th>Metric</th><th>Definition</th><th>Dir</th><th>Unit</th>"
                 "<th>作用 / 含义(中文)</th></tr>")
    parts.extend(metric_row % info for info in _METRICS_INFO)
    parts.append("</table>")

    return "\n".join(parts)


def to_html(rows, runs=None, title="SegGen benchmark", sig=None):
    """Build the full HTML report and return it as one string.

    Sections, in order: header and protocol note, (1) Dice, (2) HD95, (3) IoU,
    (4) per-class Dice, (5) Sensitivity and Precision, then the setup appendix.
    Section 4 is emitted only when ``runs`` is given and yields per-class data;
    the other section numbers are fixed strings and do not shift when it is absent.

    Args:
        rows: Summary rows as returned by ``summarize``.
        runs: Optional raw run dicts; needed for the per-class section and for
            computing ``sig`` when it is not supplied.
        title: Used for the ``<title>`` element and as the prefix of the ``<h1>``.
            It is inserted into the markup without HTML escaping.
        sig: Optional ``{dataset: set(archs)}`` used to bold the Dice table. When
            ``None`` it is computed from ``runs`` via ``_sig_tied_sets`` (and stays
            ``None`` if ``runs`` is empty or missing).

    Returns:
        The HTML document, fragments joined with newlines.

    NOTE(review): the counts in the prose ("8 methods × 10 datasets", "3 seeds",
    "3 folds") are hard-coded text, not derived from ``rows``/``runs``.
    """
    cell, datasets, methods = _grid(rows)
    if sig is None:
        sig = _sig_tied_sets(runs) if runs else None
    # Document head with the inlined stylesheet, then the introduction.
    h = [f"<!doctype html><html><head><meta charset='utf-8'><title>{title}</title><style>{_CSS}</style>"
         "</head><body>"]
    h.append(f"<h1>{title}: 8 methods × 10 datasets (unified 512, resolution-fair)</h1>")
    h.append("<p>Eight 2D medical-image segmentation methods on ten public datasets (seven modalities). "
             "Values are <b>mean±SD</b> over 3 seeds (over the 3 folds for PanNuke). "
             "Each (dataset,method) cell aggregates tens–thousands of test images.</p>")
    h.append("<div class='note'><b>Resolution-fair protocol.</b> Convolutional nets train at 512; the fixed-input "
             "transformers (Swin-UNet 224, TransUNet 256) and nnU-Net/U-Mamba run at their native size; "
             "<b>every prediction and ground truth is then resized to a common 512×512 before scoring</b>, so "
             "boundary metrics (HD95/ASSD, in pixels) are directly comparable across methods.</div>")

    # Section 1: Dice, the only table bolded from the significance sets.
    h.append("<h2>1. Main results — Dice (%) ↑</h2>")
    h.append("<div class='cap'><b>Bold</b> = best, or not significantly different from the best per dataset "
             "(paired Wilcoxon on per-image Dice, p≥0.05). "
             "Horizontal rule separates CNNs (top) from Transformer / foundation models (bottom). "
             "No cross-dataset average is reported — the seven modalities differ too much in difficulty "
             "for a single number to be meaningful.</div>")
    h.append(_metric_table(cell, datasets, methods, "dice", True, True, bold_sets=sig))

    # Section 2: HD95 (lower is better, not a percentage).
    h.append("<h2>2. Boundary accuracy — HD95 (px) ↓</h2>")
    h.append("<div class='cap'>95% Hausdorff distance at the common 512 resolution (lower = better; "
             "<b>bold</b> = best per dataset). Now comparable across methods.</div>")
    h.append(_metric_table(cell, datasets, methods, "hd95", False, False))

    # Section 3: IoU.
    h.append("<h2>3. Overlap — IoU (%) ↑</h2>")
    h.append("<div class='cap'>Jaccard index, the stricter overlap measure (<b>bold</b> = best per dataset).</div>")
    h.append(_metric_table(cell, datasets, methods, "iou", True, True))

    # Section 4: per-class Dice, only when the raw runs provide per-class data.
    if runs:
        pcs = _perclass_section(runs)
        if pcs.strip():
            h.append("<h2>4. Per-class Dice (%) — multi-class datasets</h2>")
            h.append("<div class='cap'>Mean per-class Dice over all test images/runs (0=background excluded; "
                     "<b>bold</b>=best per class). The <i>macro</i> column weights each foreground class "
                     "equally (a within-dataset mean, not a cross-dataset one). It can differ by ~1 pt from "
                     "the §1 Dice — which is image-weighted (each image is first averaged over the classes it "
                     "contains) — whenever some images lack a class (e.g. ACDC's RV appears in only 335/380 "
                     "images); both conventions are standard, neither is an error.</div>")
            h.append(pcs)

    # Section 5: sensitivity and precision.
    h.append("<h2>5. Supplementary metrics — Sensitivity &amp; Precision (%) ↑</h2>")
    h.append("<div class='cap'>Two complementary error views (<b>bold</b> = best per dataset): low "
             "<b>Sensitivity</b> (recall) signals under-segmentation (missed foreground); low "
             "<b>Precision</b> signals over-segmentation (false positives). <i>Specificity</i> is omitted "
             "— background dominates, so it stays &gt;96% with almost no spread across methods (≤0.6 pt on "
             "average) — and <i>ASSD</i> is omitted as redundant with HD95; both, and every metric, are "
             "tabulated in full in <code>summary.csv</code>.</div>")
    h.append("<h3>Sensitivity / recall ↑</h3>")
    h.append(_metric_table(cell, datasets, methods, "sensitivity", True, True))
    h.append("<h3>Precision ↑</h3>")
    h.append(_metric_table(cell, datasets, methods, "precision", True, True))

    # Appendix: static dataset / method / metric descriptions.
    h.append("<hr><h2>Appendix — Experimental setup</h2>")
    h.append("<p class='cap'>Full per-(dataset,method) values for <b>every</b> metric "
             "(IoU, HD95, ASSD, Sensitivity, Specificity, Precision, …) are in "
             "<code>summary.csv</code>; the Dice table as LaTeX is in <code>summary.tex</code>.</p>")
    h.append(_setup_html())
    h.append("</body></html>")
    return "\n".join(h)


def main():
    """CLI entry point: aggregate one experiment and write the summary files.

    Reads ``--exp_name`` (required) and ``--out_root`` (default ``results``), loads
    every run below ``<out_root>/<exp_name>`` and writes ``summary.csv``,
    ``summary.md``, ``summary.tex`` and ``summary.html`` into that directory. The
    Markdown table is also printed to stdout. If no run is found, a message is
    printed and nothing is written.

    All files are written as UTF-8. The reports contain non-ASCII characters (±, —,
    ×, CJK text) and the HTML declares ``charset='utf-8'``, so relying on the locale
    encoding could fail or produce a mis-declared file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp_name", required=True)
    parser.add_argument("--out_root", default="results")
    args = parser.parse_args()

    runs = load_runs(args.out_root, args.exp_name)
    if not runs:
        print(f"no metrics.json under {args.out_root}/{args.exp_name}")
        return
    rows = summarize(runs)
    sig = _sig_tied_sets(runs)
    base = os.path.join(args.out_root, args.exp_name)

    def write_report(name, text):
        # Context manager so the handle is flushed and closed deterministically.
        with open(os.path.join(base, name), "w", encoding="utf-8") as f:
            f.write(text)

    write_report("summary.csv", to_csv(rows))
    markdown = to_markdown(rows, sig)  # rendered once, reused for the console output
    write_report("summary.md", markdown)
    write_report("summary.tex", to_latex(rows, sig))
    write_report("summary.html",
                 to_html(rows, runs, title=f"SegGen benchmark ({args.exp_name})", sig=sig))
    print(markdown)
    print(f"{len(runs)} runs -> {len(rows)} (dataset,arch) cells; written {base}/summary.{{csv,md,tex,html}}")


if __name__ == "__main__":
    main()