# Source code for ultrasound_metrics.data.uff

"""
UFF dataset utilities for ultrasound data.

This module provides functions for loading UFF datasets that require pyuff_ustb.
If you don't work with UFF files, you can use the general utilities in visualize_bmode.py
without needing to install pyuff_ustb.
"""

from pathlib import Path
from typing import TYPE_CHECKING, Any, TypedDict, cast

from ultrasound_metrics._utils.array_api import ArrayAPIObj
from ultrasound_metrics.data.downloads import cached_download

if TYPE_CHECKING:
    pass

# Import-time dependency check: this module is unusable without pyuff_ustb,
# so a missing dependency is reported immediately with install instructions.
try:
    import pyuff_ustb as uff  # ty: ignore[unresolved-import]  # pyright: ignore[reportMissingImports]
    from pyuff_ustb import (  # ty: ignore[unresolved-import]  # pyright: ignore[reportMissingImports]
        BeamformedData,
        ChannelData,
        Uff,
    )
    from pyuff_ustb.readers.base import (  # ty: ignore[unresolved-import]  # pyright: ignore[reportMissingImports]
        ReaderKeyError,
    )

    # NOTE(review): because the except-branch below re-raises when pyuff_ustb
    # itself is missing, this flag is True whenever the module import succeeds;
    # the False branch of _check_pyuff_ustb() appears unreachable as written —
    # confirm whether lazy (call-time) failure was the original intent.
    _HAS_PYUFF_USTB = True
except ImportError as err:
    if "pyuff_ustb" in str(err):
        # Only raise ImportError if this is the specific missing dependency
        raise ImportError(
            "pyuff_ustb is required. Install with: `uv pip install ultrasound-metrics[uff]` or `pip install pyuff_ustb`"
        ) from err
    else:
        # Re-raise other import errors (e.g. a broken transitive dependency)
        raise


# Default cache directory for downloaded datasets
CACHE_DIR = Path.home().joinpath(".cache", "ultrasound-metrics", "datasets")


class DatasetInfo(TypedDict):
    """Type definition for dataset information."""

    # Download URL for the .uff file
    url: str
    # Local filename used inside CACHE_DIR
    filename: str
    # Human-readable description of the dataset
    description: str
    # Expected download size in bytes (used to validate downloads)
    size: int


# Dataset registry with download URLs and metadata
USTB_DATASETS: dict[str, DatasetInfo] = {
    "picmus_resolution_experiment": {
        "url": "https://f004.backblazeb2.com/b2api/v1/b2_download_file_by_id?fileId=4_z81bac298ed734da8927d0614_f112a4a231dbce513_d20250729_m192149_c004_v0402004_t0044_u01753816909257",
        # We can also use the USTB URL, but it has rate-limits
        # "url": "http://www.ustb.no/datasets/PICMUS_experiment_resolution_distortion.uff",
        "filename": "PICMUS_experiment_resolution_distortion.uff",
        "description": "PICMUS challenge resolution/distortion test (experiment)",
        "size": 145518524,
    },
    "picmus_contrast_experiment": {
        "url": "https://f004.backblazeb2.com/b2api/v1/b2_download_file_by_id?fileId=4_z81bac298ed734da8927d0614_f100d6106d29bf5da_d20250729_m192144_c004_v0402027_t0027_u01753816904341",
        # We can also use the USTB URL, but it has rate-limits
        # "url": "http://www.ustb.no/datasets/PICMUS_experiment_contrast_speckle.uff",
        "filename": "PICMUS_experiment_contrast_speckle.uff",
        "description": "PICMUS challenge contrast/speckle test (experiment)",
        "size": 145518504,
    },
}
def _check_pyuff_ustb() -> None:
    """
    Check if pyuff_ustb is available and raise ImportError if not.

    Raises
    ------
    ImportError
        If pyuff_ustb is not installed.
    """
    if not _HAS_PYUFF_USTB:
        raise ImportError("pyuff_ustb is required to load UFF datasets. Install with: pip install pyuff_ustb")


def list_available_datasets() -> dict[str, DatasetInfo]:
    """
    List all available datasets with their metadata.

    Returns
    -------
    dict
        Dictionary mapping dataset names to their information.
    """
    # Shallow copy so callers cannot mutate the module-level registry.
    return USTB_DATASETS.copy()


def inspect_dataset(dataset_name: str) -> dict:
    """
    Inspect a specific dataset and its cache status.

    Parameters
    ----------
    dataset_name
        Name of the dataset to inspect.

    Returns
    -------
    dict
        Dictionary with dataset metadata and cache information.  The
        "size" entry is the on-disk size of the cached file in bytes, or
        None when the file is not cached — it is NOT the registry's
        expected download size.

    Raises
    ------
    KeyError
        If the dataset is not found.

    Examples
    --------
    >>> info = inspect_dataset("picmus_resolution_experiment")
    >>> print(f"URL: {info['url']}")
    >>> print(f"Cached: {info['cached']}")
    >>> print(f"Size: {info['size']} bytes")
    """
    if dataset_name not in USTB_DATASETS:
        raise KeyError(f"Dataset {dataset_name} not found")

    dataset_info = USTB_DATASETS[dataset_name]
    cached_file = CACHE_DIR / dataset_info["filename"]
    cached = cached_file.exists()

    return {
        "name": dataset_name,
        "url": dataset_info["url"],
        "filename": dataset_info["filename"],
        "description": dataset_info["description"],
        "cached": cached,
        "cache_path": cached_file,
        "size": cached_file.stat().st_size if cached else None,
    }


def load_dataset(dataset_name: str, download_if_missing: bool = True, key: str = "/beamformed_data") -> ArrayAPIObj:
    """
    Load a dataset using pyuff_ustb.

    Parameters
    ----------
    dataset_name
        Name of the dataset to load.
    download_if_missing
        Whether to download the dataset if not cached.
    key
        Key to read from the UFF file. Common keys include:

        - "/beamformed_data": Beamformed ultrasound data (default)
        - "/channel_data": Channel data for temporal analysis

    Returns
    -------
    ndarray
        The loaded dataset as a numpy array (default key: "/beamformed_data").

    Raises
    ------
    ImportError
        If pyuff_ustb is not installed.
    KeyError
        If the dataset is not found.
    FileNotFoundError
        If dataset is not cached and download_if_missing=False.

    Examples
    --------
    Load beamformed data (default):

    >>> data = load_dataset("picmus_resolution_experiment")
    >>> print(f"Dataset shape: {data.shape}")

    Load channel data for temporal analysis:

    >>> channel_data = load_dataset("picmus_resolution_experiment", key="/channel_data")
    >>> print(f"Channel data shape: {channel_data.shape}")
    """
    # Fail fast if pyuff_ustb is not available
    _check_pyuff_ustb()

    if dataset_name not in USTB_DATASETS:
        raise KeyError(f"Dataset {dataset_name} not found")

    dataset_info = USTB_DATASETS[dataset_name]
    # NOTE(review): assumes cached_download stores to CACHE_DIR/filename —
    # confirm against downloads.cached_download.
    cached_file = CACHE_DIR / dataset_info["filename"]

    # Download if missing and requested; the registry's expected size lets
    # cached_download verify the transfer.
    if download_if_missing:
        cached_download(
            url=dataset_info["url"],
            filename=dataset_info["filename"],
            expected_size=dataset_info["size"],
        )
    elif not cached_file.exists():
        raise FileNotFoundError(f"Dataset {dataset_name} is not cached")

    # Load and return the actual data
    uff_file: Uff = Uff(str(cached_file))
    try:
        key_data = uff_file.read(key)
    except ReaderKeyError as err:
        raise KeyError(f"Key {key} not found in dataset {dataset_name}") from err

    # Only container objects that carry a .data payload are supported here.
    if not hasattr(key_data, "data"):
        raise ValueError(f"Key {key} does not contain data")
    return cast(BeamformedData | ChannelData, key_data).data


def load_uff_dataset(dataset_name: str) -> tuple[ArrayAPIObj, Any]:
    """
    Load and reshape any UFF dataset.

    Parameters
    ----------
    dataset_name
        Name of the dataset to load.

    Returns
    -------
    tuple
        Tuple of (beamformed_image, scan_info).

    Raises
    ------
    ImportError
        If pyuff_ustb is not installed.
    KeyError
        If the dataset is not found.
    """
    # Fail fast if pyuff_ustb is not available (consistent with load_dataset)
    _check_pyuff_ustb()

    if dataset_name not in USTB_DATASETS:
        raise KeyError(f"Dataset {dataset_name} not found")

    # FIX: read the registry entry directly instead of inspect_dataset(),
    # whose "size" is the on-disk size or None when not cached — passing
    # that as expected_size silently skipped verification of fresh downloads.
    dataset_info = USTB_DATASETS[dataset_name]
    data_file = cached_download(
        url=dataset_info["url"],
        filename=dataset_info["filename"],
        expected_size=dataset_info["size"],
    )

    # Load using UFF
    uff_file = uff.Uff(str(data_file))
    beamformed_data = cast(BeamformedData, uff_file.read("/beamformed_data"))

    # Get scan geometry
    scan = beamformed_data.scan

    # Reshape the flat pixel vector using the scan geometry.
    # NOTE(review): assumes pixels are stored x-major — confirm against USTB.
    beamformed_image = beamformed_data.data.reshape(
        (scan.x_axis.size, scan.z_axis.size),
    )

    return beamformed_image, scan