Source code for whisper_smith.diarize

import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Any, Protocol

from whisper_smith.models import DiarizationResult, DiarizationSegment

DEFAULT_DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"


class PyannotePipeline(Protocol):
    def __call__(self, audio_path: str, **kwargs: Any) -> Any: ...


def _load_pyannote_pipeline_class() -> Any:
    try:
        from pyannote.audio import Pipeline
    except ImportError as error:
        raise RuntimeError(
            "Speaker diarization requires pyannote.audio. "
            "Install it with 'uv sync --extra diarize' or "
            "'pip install whisper-smith[diarize]'."
        ) from error
    except AttributeError as error:
        if "AudioMetaData" not in str(error):
            raise
        raise RuntimeError(
            "Speaker diarization dependency versions are incompatible: "
            "pyannote.audio expects a torchaudio version with AudioMetaData. "
            "Run 'uv lock --upgrade-package torch --upgrade-package torchaudio' "
            "and then 'uv sync --extra diarize'."
        ) from error

    return Pipeline


def _allow_trusted_pyannote_checkpoint_globals() -> None:
    try:
        import torch
        from pyannote.audio.core.task import Problem, Resolution, Scope, Specifications
        from torch.torch_version import TorchVersion
    except ImportError:
        return

    add_safe_globals = getattr(torch.serialization, "add_safe_globals", None)
    if add_safe_globals is None:
        return

    add_safe_globals([TorchVersion, Specifications, Problem, Resolution, Scope])


def _resolve_ffmpeg_executable() -> str:
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path:
        return ffmpeg_path

    try:
        import imageio_ffmpeg
    except ImportError as error:
        raise RuntimeError(
            "Speaker diarization requires ffmpeg for media conversion. "
            "Install system ffmpeg or add the 'imageio-ffmpeg' package."
        ) from error

    return imageio_ffmpeg.get_ffmpeg_exe()


def _convert_audio_for_diarization(path: Path, output_dir: Path) -> Path:
    wav_path = output_dir / f"{path.stem}.diarization.wav"
    command = [
        _resolve_ffmpeg_executable(),
        "-hide_banner",
        "-loglevel",
        "error",
        "-i",
        str(path),
        "-vn",
        "-ac",
        "1",
        "-ar",
        "16000",
        "-y",
        str(wav_path),
    ]
    subprocess.run(command, check=True)
    return wav_path


def _resolve_hf_token(hf_token: str | None) -> str | None:
    return (
        hf_token
        or os.getenv("HUGGINGFACE_TOKEN")
        or os.getenv("PYANNOTE_AUTH_TOKEN")
    )


[docs] def from_pyannote_output(output: Any) -> DiarizationResult: diarization = ( getattr(output, "exclusive_speaker_diarization", None) or getattr(output, "speaker_diarization", None) or output ) if not hasattr(diarization, "itertracks"): raise TypeError("Unsupported pyannote diarization output.") segments: list[DiarizationSegment] = [] for turn, _track, speaker in diarization.itertracks(yield_label=True): segments.append( DiarizationSegment( start=float(turn.start), end=float(turn.end), speaker=str(speaker), ) ) return DiarizationResult(segments=segments)
[docs] def diarize_audio( audio_path: str | Path, *, hf_token: str | None = None, model: str = DEFAULT_DIARIZATION_MODEL, num_speakers: int | None = None, min_speakers: int | None = None, max_speakers: int | None = None, pipeline: PyannotePipeline | None = None, ) -> DiarizationResult: path = Path(audio_path) if not path.is_file(): raise FileNotFoundError(f"Audio file not found: {path}") diarization_pipeline = pipeline if diarization_pipeline is None: token = _resolve_hf_token(hf_token) if not token: raise RuntimeError( "Hugging Face token not found. Set HUGGINGFACE_TOKEN in your " "environment or .env file, or pass hf_token explicitly." ) Pipeline = _load_pyannote_pipeline_class() _allow_trusted_pyannote_checkpoint_globals() try: diarization_pipeline = Pipeline.from_pretrained(model, token=token) except TypeError as error: if "token" not in str(error): raise diarization_pipeline = Pipeline.from_pretrained( model, use_auth_token=token, ) if diarization_pipeline is None: raise RuntimeError( f"Could not load pyannote diarization model: {model}. " "Check that HUGGINGFACE_TOKEN is valid and that you accepted the " "model's Hugging Face user conditions." ) pipeline_kwargs: dict[str, int] = {} if num_speakers is not None: pipeline_kwargs["num_speakers"] = num_speakers if min_speakers is not None: pipeline_kwargs["min_speakers"] = min_speakers if max_speakers is not None: pipeline_kwargs["max_speakers"] = max_speakers try: with tempfile.TemporaryDirectory(prefix="whisper_smith_diarize_") as temp_dir: diarization_path = _convert_audio_for_diarization(path, Path(temp_dir)) output = diarization_pipeline(str(diarization_path), **pipeline_kwargs) return from_pyannote_output(output) except subprocess.CalledProcessError as error: raise RuntimeError( "Failed to prepare audio for speaker diarization. The file may be malformed." ) from error
[docs] def diarize_file( audio_path: str | Path, *, hf_token: str | None = None, model: str = DEFAULT_DIARIZATION_MODEL, num_speakers: int | None = None, min_speakers: int | None = None, max_speakers: int | None = None, ) -> DiarizationResult: return diarize_audio( audio_path, hf_token=hf_token, model=model, num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers, )