Source code for thesis.workflows.qc.statistics

"""Collect tractography statistics from per-patient output directories.

Reads ``waytotal``, ``fdt_paths.nii.gz``, and ROI mask files to produce
a structured summary of tractography results for one or more subjects.
Works with both ProbTrackX2 (``tractography/probtrackx2``, the default)
and MRtrix3 (``tractography/mrtrix3``) layouts via the
``tractography_relpath`` parameter.

For cross-method comparison the collector also emits a unified
**Normalized Connection Strength (NCS)** per subject and per hemisphere:

* ProbTrackX2:
  ``NCS = target_total_streamlines / (n_samples × |seed_voxels|)`` —
  per-sample arrival probability in ``[0, 1]``.
* MRtrix3:
  ``NCS = mu × target_total_streamlines`` — Fibre Bundle Capacity
  (units of fibre cross-sectional area). The target sum is already
  SIFT2-weighted (the per-target TDI uses ``-tck_weights_in``), so
  multiplying by ``mu`` from ``sift2_mu.txt`` gives the FBC value.

The two values are different physical quantities; cross-method
comparison should use Spearman rank / z-scored profiles, not absolute
equality.

Example:
    >>> from thesis.workflows.qc.statistics import collect_tractography_stats
    >>> stats = collect_tractography_stats(Path("outputs/114823"))
    >>> print(stats["waytotal"])
    42351
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np

from thesis.core.logging import get_logger

logger = get_logger(__name__)

try:
    import nibabel as nib

    NIBABEL_AVAILABLE = True
except ImportError:  # pragma: no cover
    NIBABEL_AVAILABLE = False

__all__ = [
    "collect_tractography_stats",
    "collect_batch_stats",
    "format_stats_table",
]


def _detect_backend(tractography_relpath: str) -> str:
    """Return ``"mrtrix3"`` or ``"probtrackx2"`` based on the relpath.

    Falls back to ``"probtrackx2"`` for unrecognised paths because that
    is the historical default and matches the bare ``waytotal`` schema.
    """
    rel = tractography_relpath.lower()
    if "mrtrix3" in rel:
        return "mrtrix3"
    return "probtrackx2"


def _read_tractography_params(tractography_dir: Path) -> Dict[str, Any]:
    """Read ``tractography_params.json`` if present.

    Returns the parsed dict on success, or ``{}`` when the file is
    missing or unparsable. Keys present in the file (``backend``,
    ``n_samples`` for ProbTrackX2; ``backend``, ``select``, ``mu`` for
    MRtrix3) are passed through unchanged.
    """
    params_file = tractography_dir / "tractography_params.json"
    if not params_file.is_file():
        return {}
    try:
        parsed: Any = json.loads(params_file.read_text())
    except (OSError, ValueError) as exc:
        logger.debug("Could not read {}: {}", params_file, exc)
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _read_sift2_mu_file(tractography_dir: Path) -> Optional[float]:
    """Read ``sift2_mu.txt`` (single float) if present.

    Returns ``None`` when the file is missing or unparsable. This is
    the MRtrix3 proportionality coefficient that converts SIFT2 weight
    sums into Fibre Bundle Capacity values.
    """
    mu_file = tractography_dir / "sift2_mu.txt"
    if not mu_file.is_file():
        return None
    try:
        text = mu_file.read_text().strip()
    except OSError:
        return None
    for token in text.split():
        try:
            return float(token)
        except ValueError:
            continue
    return None


def _classify_roi_voxel_count(
    roi_counts: Dict[str, int],
    keyword: str,
) -> int:
    """Sum voxel counts for ROI keys whose name contains ``keyword``.

    The canonical 5-tuple uses names like ``main/medulla-seed_transformed``
    and ``main/precentral-target_transformed``. Matching on the substring
    (case-insensitive) covers single-side and hemisphere-prefixed layouts
    (``"left:main/medulla-seed_transformed"``).

    Args:
        roi_counts: Output of :func:`_roi_voxel_counts` (possibly
            hemisphere-prefixed via :func:`_prefix_roi_counts`).
        keyword: ``"seed"`` or ``"target"``.

    Returns:
        Sum of matching counts, or ``0`` if nothing matches.
    """
    total = 0
    kw = keyword.lower()
    for name, count in roi_counts.items():
        if kw in name.lower() and int(count) > 0:
            total += int(count)
    return total


def _compute_ncs(
    backend: str,
    target_streamlines: Optional[float],
    seed_voxels: int,
    n_samples: Optional[int] = None,
    mu: Optional[float] = None,
) -> Optional[float]:
    """Compute the Normalized Connection Strength for one run.

    See module docstring for the per-backend formula. Returns ``None``
    when any input required by the active backend is missing or
    non-positive.
    """
    if target_streamlines is None or target_streamlines <= 0:
        return None
    if backend == "probtrackx2":
        if not n_samples or seed_voxels <= 0:
            return None
        denom = float(n_samples) * float(seed_voxels)
        if denom <= 0:
            return None
        return float(target_streamlines) / denom
    if backend == "mrtrix3":
        if mu is None:
            return None
        return float(mu) * float(target_streamlines)
    return None


def _discover_tractography_dirs(tractography_dir: Path) -> List[Path]:
    """Return tractography directories to inspect.

    If ``left/`` and/or ``right/`` subdirectories exist, they are treated as
    independent hemisphere runs. Otherwise the base directory itself is used.

    Args:
        tractography_dir: Base ``probtrackx2`` output directory.

    Returns:
        List of tractography directories to inspect.
    """
    hemisphere_dirs = [
        hemi_dir
        for hemi_dir in (tractography_dir / "left", tractography_dir / "right")
        if hemi_dir.is_dir()
    ]
    return hemisphere_dirs or [tractography_dir]


def _merge_space_stats(stats_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Merge one or more ``_fdt_paths_stats`` dictionaries.

    Args:
        stats_list: Per-run stats dictionaries.

    Returns:
        Aggregated stats dictionary.
    """
    if not stats_list:
        return {}

    total_streamlines = int(sum(int(s.get("total_streamlines", 0)) for s in stats_list))
    nonzero_voxels = int(sum(int(s.get("nonzero_voxels", 0)) for s in stats_list))
    total_voxels = int(sum(int(s.get("total_voxels", 0)) for s in stats_list))
    max_density = int(max(int(s.get("max_density", 0)) for s in stats_list))

    if total_voxels == 0 or nonzero_voxels == 0:
        return {
            "total_streamlines": total_streamlines,
            "nonzero_voxels": nonzero_voxels,
            "total_voxels": total_voxels,
            "volume_fraction": 0.0,
            "mean_density": 0.0,
            "max_density": max_density,
            "median_density": 0.0,
        }

    mean_density = round(total_streamlines / nonzero_voxels, 2)
    volume_fraction = round(nonzero_voxels / total_voxels, 6)

    median_candidates = [float(s.get("median_density", 0.0)) for s in stats_list if s]
    median_density = round(float(np.median(median_candidates)), 2) if median_candidates else 0.0

    return {
        "total_streamlines": total_streamlines,
        "nonzero_voxels": nonzero_voxels,
        "total_voxels": total_voxels,
        "volume_fraction": volume_fraction,
        "mean_density": mean_density,
        "max_density": max_density,
        "median_density": median_density,
    }


def _prefix_roi_counts(roi_counts: Dict[str, int], prefix: str) -> Dict[str, int]:
    """Prefix ROI names when aggregating hemisphere-specific outputs."""
    return {f"{prefix}:{name}": count for name, count in roi_counts.items()}


def _read_waytotal(tractography_dir: Path) -> Optional[float]:
    """Read the ``waytotal`` text file produced by ProbTrackX2 or MRtrix3.

    ProbTrackX2 writes an integer streamline count; MRtrix3 writes a float
    (sum of SIFT2 weights). Both round-trip through ``float`` cleanly.

    Args:
        tractography_dir: Tractography output directory (e.g.
            ``probtrackx2/`` or ``mrtrix3/{left,right}/``).

    Returns:
        The waytotal value as a float, or ``None`` if the file is missing
        or unreadable.
    """
    waytotal_file = tractography_dir / "waytotal"
    if not waytotal_file.exists():
        return None
    try:
        text = waytotal_file.read_text().strip()
        return float(text)
    except (ValueError, OSError) as exc:
        logger.debug("Could not read waytotal from {}: {}", waytotal_file, exc)
        return None


def _fdt_paths_stats(fdt_path: Path) -> Dict[str, Any]:
    """Compute summary statistics from an ``fdt_paths.nii.gz`` image.

    Args:
        fdt_path: Path to the track density NIfTI file.

    Returns:
        Dictionary with keys: ``total_streamlines``, ``nonzero_voxels``,
        ``total_voxels``, ``volume_fraction``, ``mean_density``,
        ``max_density``, ``median_density`` (all over non-zero voxels).
    """
    if not NIBABEL_AVAILABLE:
        return {}

    img = nib.load(str(fdt_path))
    data: np.ndarray = np.asarray(img.dataobj)  # type: ignore[attr-defined]

    nonzero_mask = data > 0
    nonzero_vals = data[nonzero_mask]
    total_voxels = int(data.size)
    nonzero_voxels = int(nonzero_vals.size)

    if nonzero_voxels == 0:
        return {
            "total_streamlines": 0,
            "nonzero_voxels": 0,
            "total_voxels": total_voxels,
            "volume_fraction": 0.0,
            "mean_density": 0.0,
            "max_density": 0.0,
            "median_density": 0.0,
        }

    return {
        "total_streamlines": int(np.sum(nonzero_vals)),
        "nonzero_voxels": nonzero_voxels,
        "total_voxels": total_voxels,
        "volume_fraction": round(nonzero_voxels / total_voxels, 6),
        "mean_density": round(float(np.mean(nonzero_vals)), 2),
        "max_density": int(np.max(nonzero_vals)),
        "median_density": round(float(np.median(nonzero_vals)), 2),
    }


def discover_roi_dir_files(tractography_dir: Path) -> Tuple[Optional[Path], List[Path]]:
    """Locate ROI mask files under a tractography output directory.

    Searches ``rois_merged/``, ``rois_transformed/``, and ``rois/``
    (in that order) and returns the first directory that contains at
    least one NIfTI mask, together with the sorted list of mask files
    found there. Callers apply their own per-file name mapping.

    Args:
        tractography_dir: The tractography output directory
            (e.g. ``.../tractography/probtrackx2``).

    Returns:
        Tuple of ``(roi_dir, nii_files)``. ``roi_dir`` is ``None`` and
        ``nii_files`` is empty when no non-empty ROI directory exists.
    """
    search_order = [
        tractography_dir / "rois_merged",
        tractography_dir / "rois_transformed",
        tractography_dir / "rois",
    ]
    for roi_dir in search_order:
        if not roi_dir.is_dir():
            continue
        nii_files = sorted(roi_dir.rglob("*.nii*"))
        if nii_files:
            return roi_dir, nii_files
    return None, []


def _roi_voxel_counts(tractography_dir: Path) -> Dict[str, int]:
    """Count non-zero voxels in each ROI mask.

    Searches ``rois_merged/``, ``rois_transformed/``, and ``rois/``
    (in that order), returning the first non-empty set found.

    Args:
        tractography_dir: The ``probtrackx2/`` output directory.

    Returns:
        Mapping of ROI name to non-zero voxel count.
    """
    if not NIBABEL_AVAILABLE:
        return {}

    roi_dir, nii_files = discover_roi_dir_files(tractography_dir)
    if roi_dir is None:
        return {}

    counts: Dict[str, int] = {}
    for f in nii_files:
        rel = f.relative_to(roi_dir).as_posix()
        name = rel.replace(".nii.gz", "").replace(".nii", "")
        try:
            img = nib.load(str(f))
            data = np.asarray(img.dataobj)  # type: ignore[attr-defined]
            counts[name] = int(np.count_nonzero(data))
        except Exception as exc:
            logger.debug("Could not read ROI {}: {}", f, exc)
            counts[name] = -1
    return counts



[docs]
def collect_tractography_stats(
    patient_output: Union[str, Path],
    patient_id: Optional[str] = None,
    tractography_relpath: str = "tractography/probtrackx2",
    tract_similarity_subdir: str = "tract_similarity",
) -> Dict[str, Any]:
    """Collect tractography statistics for a single subject.

    Reads output files from a tractography run (ProbTrackX2 or MRtrix3)
    and returns a structured dictionary of statistics.

    Args:
        patient_output: Patient-level output directory
            (e.g. ``outputs/114823/``).
        patient_id: Optional patient identifier (included in the
            returned dict for convenience).
        tractography_relpath: Relative path under ``patient_output``
            where the tractography run lives. Defaults to
            ``"tractography/probtrackx2"``. Use
            ``"tractography/mrtrix3"`` for the MRtrix3 backend.
        tract_similarity_subdir: Relative path under ``patient_output``
            where the ``tract_similarity`` / ``tract_similarity_hcp_loo``
            workflow wrote ``metrics.json``. Read opportunistically and
            attached under ``result["tract_similarity"]`` when present.

    Returns:
        Dictionary with keys:

        - ``patient_id`` — subject identifier (or ``"unknown"``).
        - ``waytotal`` — total streamlines reaching all waypoints.
        - ``subject_space`` — fdt_paths statistics in subject space.
        - ``template_space`` — fdt_paths statistics in template space
          (empty dict if warped_streamlines not present).
        - ``roi_voxel_counts`` — non-zero voxel count per ROI mask.
        - ``tract_similarity`` — overlap / correlation / distance_mm /
          distribution / voxel_counts / thresholds blocks copied from
          ``<tract_similarity_subdir>/metrics.json`` (only present when
          that file exists).

    Raises:
        FileNotFoundError: If *patient_output* does not exist.

    Example:
        >>> stats = collect_tractography_stats("outputs/114823")
        >>> stats["waytotal"]
        42351
    """
    out = Path(patient_output)
    if not out.exists():
        raise FileNotFoundError(f"Patient output directory not found: {out}")

    tractography_dir = out / tractography_relpath
    run_dirs = _discover_tractography_dirs(tractography_dir)
    pid = patient_id or out.name

    backend = _detect_backend(tractography_relpath)
    result: Dict[str, Any] = {"patient_id": pid, "backend": backend}

    hemisphere_runs: Dict[str, Dict[str, Any]] = {}
    waytotals: List[float] = []
    subject_stats: List[Dict[str, Any]] = []
    template_stats: List[Dict[str, Any]] = []
    roi_counts_merged: Dict[str, int] = {}
    seed_voxel_count: Dict[str, int] = {}
    target_voxel_count: Dict[str, int] = {}
    per_run_data: List[Dict[str, Any]] = []

    for run_dir in run_dirs:
        run_name = run_dir.name if run_dir != tractography_dir else "combined"
        run_stats: Dict[str, Any] = {}

        waytotal = _read_waytotal(run_dir)
        run_stats["waytotal"] = waytotal
        if waytotal is not None:
            waytotals.append(waytotal)

        subject_fdt = run_dir / "fdt_paths.nii.gz"
        if subject_fdt.exists():
            run_stats["subject_space"] = _fdt_paths_stats(subject_fdt)
            subject_stats.append(run_stats["subject_space"])
        else:
            run_stats["subject_space"] = {}

        template_fdt = run_dir / "warped_streamlines" / "fdt_paths.nii.gz"
        if template_fdt.exists():
            run_stats["template_space"] = _fdt_paths_stats(template_fdt)
            template_stats.append(run_stats["template_space"])
        else:
            run_stats["template_space"] = {}

        run_roi_counts = _roi_voxel_counts(run_dir)
        run_seed_voxels = _classify_roi_voxel_count(run_roi_counts, "seed")
        run_target_voxels = _classify_roi_voxel_count(run_roi_counts, "target")

        if len(run_dirs) > 1:
            run_stats["roi_voxel_counts"] = _prefix_roi_counts(run_roi_counts, run_name)
            roi_counts_merged.update(run_stats["roi_voxel_counts"])
        else:
            run_stats["roi_voxel_counts"] = run_roi_counts
            roi_counts_merged.update(run_roi_counts)

        # Per-target connectivity stats for the run (needed for NCS).
        run_conn_stats: Dict[str, Dict[str, Any]] = {}
        try:
            from thesis.workflows.qc.checks import collect_connectivity_map_stats

            run_conn_stats = collect_connectivity_map_stats(run_dir)
        except Exception:
            run_conn_stats = {}
        if run_conn_stats:
            run_stats["connectivity_maps"] = run_conn_stats

        # Per-run params and SIFT2 mu.
        run_params = _read_tractography_params(run_dir)
        if run_params:
            run_stats["tractography_params"] = run_params
        run_mu = _read_sift2_mu_file(run_dir)
        if run_mu is not None:
            run_stats.setdefault("tractography_params", {})["mu"] = run_mu

        per_run_data.append(
            {
                "name": run_name,
                "stats": run_stats,
                "seed_voxels": run_seed_voxels,
                "target_voxels": run_target_voxels,
                "conn": run_conn_stats,
                "params": run_stats.get("tractography_params", {}),
            }
        )
        if run_seed_voxels:
            seed_voxel_count[run_name] = run_seed_voxels
        if run_target_voxels:
            target_voxel_count[run_name] = run_target_voxels

        if len(run_dirs) > 1:
            hemisphere_runs[run_name] = run_stats

    result["waytotal"] = sum(waytotals) if waytotals else None
    result["subject_space"] = _merge_space_stats(subject_stats)
    result["template_space"] = _merge_space_stats(template_stats)
    result["roi_voxel_counts"] = roi_counts_merged
    if hemisphere_runs:
        result["hemisphere_runs"] = hemisphere_runs

    # Combined connectivity map stats (fall back to the first run when
    # only the per-hemisphere directories carry seeds_to_* files). In the
    # single-run case the combined target directory is the run directory
    # itself, so reuse the per-run result instead of re-loading and
    # re-summing every connectivity volume a second time.
    try:
        if len(run_dirs) == 1 and per_run_data:
            conn_stats = per_run_data[0]["conn"]
        else:
            from thesis.workflows.qc.checks import collect_connectivity_map_stats

            conn_target_dir = tractography_dir if len(run_dirs) == 1 else run_dirs[0]
            conn_stats = collect_connectivity_map_stats(conn_target_dir)
        if conn_stats:
            result["connectivity_maps"] = conn_stats
    except Exception:
        pass

    # Aggregate seed/target voxel counts.
    if seed_voxel_count or target_voxel_count:
        combined_seed = sum(seed_voxel_count.values())
        combined_target = sum(target_voxel_count.values())
        seeds_payload: Dict[str, int] = dict(seed_voxel_count)
        targets_payload: Dict[str, int] = dict(target_voxel_count)
        seeds_payload["combined"] = combined_seed
        targets_payload["combined"] = combined_target
        result["seed_voxel_count"] = seeds_payload
        result["target_voxel_count"] = targets_payload

    # Top-level tractography_params: prefer the first run's params and
    # merge any per-run mu if missing at top level.
    top_params: Dict[str, Any] = {}
    for entry in per_run_data:
        for key, val in (entry["params"] or {}).items():
            if key == "mu":
                continue  # mu is per-hemisphere; keep at run level
            top_params.setdefault(key, val)
    # Promote the first available mu only when there's a single run, to
    # avoid implying that two hemispheres share the same mu (they don't).
    if len(per_run_data) == 1 and "mu" in (per_run_data[0]["params"] or {}):
        top_params["mu"] = per_run_data[0]["params"]["mu"]
    if top_params:
        result["tractography_params"] = top_params

    # NCS — per-run (left/right) and combined.
    n_samples = top_params.get("n_samples") if backend == "probtrackx2" else None
    ncs: Dict[str, Optional[float]] = {}

    def _target_streamlines(conn: Dict[str, Dict[str, Any]]) -> Optional[float]:
        if not conn:
            return None
        # Prefer the canonical "target_dwi" key; otherwise sum all entries.
        if "target_dwi" in conn:
            return float(conn["target_dwi"].get("total_streamlines", 0) or 0)
        total = 0.0
        for entry in conn.values():
            total += float(entry.get("total_streamlines", 0) or 0)
        return total if total > 0 else None

    combined_target_streamlines: float = 0.0
    have_combined_target = False
    for entry in per_run_data:
        run_ts = _target_streamlines(entry["conn"])
        if run_ts is None:
            continue
        have_combined_target = True
        combined_target_streamlines += run_ts
        run_mu = (entry["params"] or {}).get("mu")
        run_n = (entry["params"] or {}).get("n_samples", n_samples)
        run_ncs = _compute_ncs(
            backend=backend,
            target_streamlines=run_ts,
            seed_voxels=int(entry["seed_voxels"]),
            n_samples=run_n,
            mu=run_mu,
        )
        if run_ncs is not None and entry["name"] != "combined":
            ncs[entry["name"]] = run_ncs

    # Combined NCS — recompute on aggregate seed voxels and aggregate
    # target streamlines using the top-level mu (single run) or the
    # per-run mu when ambiguous (defer in that case).
    if have_combined_target:
        combined_seed = seed_voxel_count.get("combined") or sum(seed_voxel_count.values())
        combined_mu: Optional[float]
        if backend == "mrtrix3":
            # Use top-level mu when available (single run); otherwise
            # leave combined NCS blank — averaging mu across hemispheres
            # would lose the per-run cross-sectional-area meaning.
            combined_mu = top_params.get("mu")
        else:
            combined_mu = None
        combined_ncs = _compute_ncs(
            backend=backend,
            target_streamlines=combined_target_streamlines,
            seed_voxels=int(combined_seed),
            n_samples=n_samples,
            mu=combined_mu,
        )
        if combined_ncs is not None:
            ncs["combined"] = combined_ncs
        elif backend == "mrtrix3":
            left_ncs = ncs.get("left")
            right_ncs = ncs.get("right")
            if left_ncs is not None and right_ncs is not None:
                # MRtrix3 FBC is additive across disjoint hemispheres; sum
                # the per-run NCS when each hemisphere has its own mu.
                ncs["combined"] = left_ncs + right_ncs

    if ncs:
        result["ncs"] = ncs

    # SynthSeg QC and volume stats
    try:
        from thesis.workflows.qc.checks import parse_synthseg_qc_csv, parse_synthseg_volumes_csv

        synthseg_qc = parse_synthseg_qc_csv(out)
        if synthseg_qc:
            result["synthseg_qc"] = synthseg_qc
        synthseg_vols = parse_synthseg_volumes_csv(out)
        if synthseg_vols:
            result["synthseg_volumes"] = synthseg_vols
    except Exception:
        pass

    # tract_similarity metrics — opportunistic: attached when the workflow
    # has been run for this subject. Copies only the metric content (not
    # the duplicated patient_id) so the per-subject dict stays flat.
    ts_file = out / tract_similarity_subdir / "metrics.json"
    if ts_file.is_file():
        try:
            ts_data = json.loads(ts_file.read_text(encoding="utf-8"))
            ts_block = {
                key: ts_data.get(key)
                for key in (
                    "overlap",
                    "correlation",
                    "distance_mm",
                    "distribution",
                    "voxel_counts",
                    "thresholds",
                )
                if ts_data.get(key) is not None
            }
            if ts_block:
                result["tract_similarity"] = ts_block
        except (json.JSONDecodeError, OSError) as exc:
            logger.debug("Could not parse {}: {}", ts_file, exc)

    logger.debug("Collected stats for {}: waytotal={}", pid, result["waytotal"])
    return result




[docs]
def collect_batch_stats(
    output_base: Union[str, Path],
    patient_ids: Optional[List[str]] = None,
    tractography_relpath: str = "tractography/probtrackx2",
    tract_similarity_subdir: str = "tract_similarity",
) -> List[Dict[str, Any]]:
    """Collect tractography statistics for multiple subjects.

    If *patient_ids* is ``None``, discovers subject directories
    automatically by looking for subdirectories of *output_base*
    that contain ``<tractography_relpath>/``.

    Args:
        output_base: Parent directory containing per-subject output
            directories (e.g. ``outputs/``).
        patient_ids: Explicit list of subject IDs.  If ``None``,
            auto-discovers from *output_base*.
        tractography_relpath: Relative path under each patient
            directory where the tractography run lives. Defaults to
            ``"tractography/probtrackx2"``. Use
            ``"tractography/mrtrix3"`` for the MRtrix3 backend.
        tract_similarity_subdir: Relative path under each patient
            directory where ``tract_similarity`` /
            ``tract_similarity_hcp_loo`` wrote ``metrics.json``. When
            present, that file's content is attached under
            ``subject["tract_similarity"]``.

    Returns:
        List of per-subject statistics dictionaries (same format
        as :func:`collect_tractography_stats`).

    Example:
        >>> all_stats = collect_batch_stats("outputs/")
        >>> for s in all_stats:
        ...     print(s["patient_id"], s["waytotal"])
    """
    base = Path(output_base)

    if patient_ids is None:
        # Auto-discover subjects
        patient_ids = []
        if base.is_dir():
            for child in sorted(base.iterdir()):
                if child.is_dir() and (child / tractography_relpath).is_dir():
                    patient_ids.append(child.name)

    results: List[Dict[str, Any]] = []
    for pid in patient_ids:
        patient_dir = base / pid
        if not patient_dir.exists():
            logger.warning("Patient directory not found: {}", patient_dir)
            continue
        try:
            stats = collect_tractography_stats(
                patient_dir,
                patient_id=pid,
                tractography_relpath=tractography_relpath,
                tract_similarity_subdir=tract_similarity_subdir,
            )
            results.append(stats)
        except Exception as exc:
            logger.warning("Failed to collect stats for {}: {}", pid, exc)

    return results




[docs]
def format_stats_table(
    stats_list: List[Dict[str, Any]],
    include_roi_counts: bool = False,
) -> str:
    """Format tractography statistics as a human-readable text table.

    Args:
        stats_list: List of per-subject statistics dictionaries
            (from :func:`collect_tractography_stats` or
            :func:`collect_batch_stats`).
        include_roi_counts: If ``True``, append per-ROI voxel
            counts below the main table.

    Returns:
        Formatted multi-line string suitable for terminal output.

    Example:
        >>> print(format_stats_table(all_stats))
        Patient    Waytotal   Nonzero Vox  Mean Dens  Max Dens  Vol Frac
        -------    --------   -----------  ---------  --------  --------
        114823       42351         12834      3.30       287    0.0128
    """
    if not stats_list:
        return "No statistics available."

    def _row_label(patient_id: str, hemisphere: Optional[str] = None) -> str:
        return f"{patient_id}:{hemisphere}" if hemisphere else patient_id

    display_rows: List[Dict[str, Any]] = []
    for stats in stats_list:
        display_rows.append(stats)
        hemisphere_runs = stats.get("hemisphere_runs", {})
        for hemisphere, hemi_stats in hemisphere_runs.items():
            display_rows.append(
                {
                    "patient_id": _row_label(str(stats.get("patient_id", "?")), hemisphere),
                    "waytotal": hemi_stats.get("waytotal"),
                    "subject_space": hemi_stats.get("subject_space", {}),
                    "roi_voxel_counts": hemi_stats.get("roi_voxel_counts", {}),
                    "template_space": hemi_stats.get("template_space", {}),
                }
            )

    # Header
    lines: List[str] = []
    header = (
        f"{'Patient':<12} {'Waytotal':>10} {'Nonzero Vox':>12} "
        f"{'Mean Dens':>10} {'Max Dens':>9} {'Vol Frac':>9}"
    )
    separator = f"{'─' * 12} {'─' * 10} {'─' * 12} " f"{'─' * 10} {'─' * 9} {'─' * 9}"
    lines.append(header)
    lines.append(separator)

    # Per-subject rows
    for stats in display_rows:
        pid = stats.get("patient_id", "?")
        waytotal = stats.get("waytotal")
        ss = stats.get("subject_space", {})

        wt_str = str(waytotal) if waytotal is not None else "n/a"
        nz_str = str(ss.get("nonzero_voxels", "n/a"))
        mean_str = f"{ss.get('mean_density', 0.0):.2f}" if ss else "n/a"
        max_str = str(ss.get("max_density", "n/a"))
        vf_str = f"{ss.get('volume_fraction', 0.0):.4f}" if ss else "n/a"

        lines.append(
            f"{pid:<12} {wt_str:>10} {nz_str:>12} " f"{mean_str:>10} {max_str:>9} {vf_str:>9}"
        )

    # Batch summary line
    if len(stats_list) > 1:
        waytotals = [s["waytotal"] for s in stats_list if s.get("waytotal") is not None]
        if waytotals:
            lines.append(separator)
            mean_wt = int(np.mean(waytotals))
            std_wt = int(np.std(waytotals))
            min_wt = min(waytotals)
            max_wt = max(waytotals)
            lines.append(
                f"{'Summary':<12} {'':>10} "
                f"{'n=' + str(len(waytotals)):>12} {'':>10} {'':>9} {'':>9}"
            )
            lines.append(
                f"{'  mean±std':<12} {mean_wt:>10} {'±':>12} " f"{std_wt:>10} {'':>9} {'':>9}"
            )
            lines.append(f"{'  range':<12} {min_wt:>10} {'→':>12} " f"{max_wt:>10} {'':>9} {'':>9}")

    # Optional ROI counts
    if include_roi_counts:
        lines.append("")
        lines.append("ROI voxel counts (non-zero):")
        # Collect all ROI names across subjects
        all_roi_names: List[str] = []
        for stats in display_rows:
            for name in stats.get("roi_voxel_counts", {}):
                if name not in all_roi_names:
                    all_roi_names.append(name)

        if all_roi_names:
            roi_header = f"  {'ROI':<30}"
            for stats in display_rows:
                pid = stats.get("patient_id", "?")
                roi_header += f" {pid:>10}"
            lines.append(roi_header)
            lines.append(f"  {'─' * 30}" + "".join(f" {'─' * 10}" for _ in display_rows))

            for roi_name in all_roi_names:
                row = f"  {roi_name:<30}"
                for stats in display_rows:
                    count = stats.get("roi_voxel_counts", {}).get(roi_name)
                    row += f" {count if count is not None else 'n/a':>10}"
                lines.append(row)
        else:
            lines.append("  (no ROI masks found)")

    return "\n".join(lines)