Add OpenAI-compatible API and Docker deployment

- Add FastAPI-based API in whisperx/api/ - Implement transcription endpoint compatible with OpenAI - Added Dockerfile and docker-compose.yml for easy deployment - Updated README with Docker instructions - Added new script whisperx-serve for running the API
2026-05-13 01:37:47 +03:00
parent d154f4b39b
commit c1fcb3f57c
8 changed files with 238 additions and 1 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,27 @@
 .git
 __pycache__
 *.pyc
 *.pyo
 *.pyd
 .Python
 .pytest_cache
 .coverage
 htmlcov
 .env
 .venv
 venv/
 ENV/
 env/
 # Docker
 Dockerfile*
 docker-compose*.yml
 .dockerignore
 # IDE
 .vscode
 .idea
 # OS
 .DS_Store
 *.log
--- a/36
+++ b/36
@@ -0,0 +1,36 @@
 # Use ROCm PyTorch base image
 FROM rocm/pytorch:latest
 # Set environment variables for ROCm and Python
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 ENV PYTHONPATH=/app
 ENV HF_HOME=/app/.cache/huggingface
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
    build-essential \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*
 # Set working directory
 WORKDIR /app
 # Copy project files
 COPY . .
 # Install Python dependencies
 RUN pip install --upgrade pip
 RUN pip install -e .
 # Expose port
 EXPOSE 8000
 # Set default environment variables
 ENV WHISPERX_MODEL=turbo
 ENV WHISPERX_DEVICE=cuda
 ENV WHISPERX_COMPUTE_TYPE=float16
 # Start the server
 CMD ["python", "-m", "whisperx.api.serve"]
--- a/README.md
+++ b/README.md
@@ -111,6 +111,46 @@ uv sync --all-extras --dev
 You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.
 ## Docker Deployment 🐳
 For easy deployment with GPU support, use Docker Compose:
 ### Prerequisites
 - Docker and Docker Compose installed
 - ROCm compatible GPU (AMD) or NVIDIA GPU with CUDA
 - For AMD ROCm, ensure ROCm drivers are installed on host
 ### Steps
 1. Clone the repository:
 ```bash
 git clone https://github.com/m-bain/whisperX.git
 cd whisperX
 ```
 2. Build and run the container:
 ```bash
 docker-compose up --build
 ```
 The API will be available at `http://localhost:8000`
 ### Environment Variables
 - `WHISPERX_MODEL`: Model size (default: large-v2)
 - `WHISPERX_DEVICE`: cuda or cpu (default: cuda)
 - `WHISPERX_COMPUTE_TYPE`: float16 or float32 (default: float16)
 ### API Usage
 The API is compatible with OpenAI's transcription endpoint:
 ```bash
 curl -X POST http://localhost:8000/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F "file=@audio.wav" \
  -F "model=whisper-1" \
  -F "language=en"
 ```
 ### Speaker Diarization
 To **enable Speaker Diarization**, include your Hugging Face access token (read) that you can generate from [Here](https://huggingface.co/settings/tokens) after the `--hf_token` argument and accept the user agreement for the following models: [Segmentation](https://huggingface.co/pyannote/segmentation-3.0) and [Speaker-Diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) (if you choose to use Speaker-Diarization 2.x, follow requirements [here](https://huggingface.co/pyannote/speaker-diarization) instead.)
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,28 @@
 version: '3.8'
 services:
  whisperx-api:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "8000:8000"
    environment:
      - WHISPERX_MODEL=turbo
      - WHISPERX_DEVICE=cuda
      - WHISPERX_COMPUTE_TYPE=float16
    volumes:
      # Mount Hugging Face cache if needed
      - hf_cache:/app/.cache/huggingface
    devices:
      # Allow access to all GPUs
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_ADMIN
    security_opt:
      - seccomp:unconfined
    # For AMD ROCm GPUs, use device passthrough
 volumes:
  hf_cache:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,12 +22,16 @@ dependencies = [
    "torch~=2.8.0",
    "torchaudio~=2.8.0",
    "transformers>=4.48.0",
-    "triton>=3.3.0; sys_platform == 'linux' and platform_machine == 'x86_64'" # only install triton on x86_64 Linux
+    "triton>=3.3.0; sys_platform == 'linux' and platform_machine == 'x86_64'", # only install triton on x86_64 Linux
    "fastapi>=0.104.0",
    "uvicorn[standard]>=0.24.0",
    "python-multipart>=0.0.6",
 ]
 [project.scripts]
 whisperx = "whisperx.__main__:cli"
 whisperx-serve = "whisperx.api.serve:serve"
 [build-system]
 requires = ["setuptools"]
--- a/whisperx/api/init.py
+++ b/whisperx/api/init.py
--- a/whisperx/api/main.py
+++ b/whisperx/api/main.py
@@ -0,0 +1,86 @@
 import os
 import tempfile
 import asyncio
 from contextlib import asynccontextmanager
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.responses import JSONResponse
 import torch
 import whisperx
 from whisperx.schema import TranscriptionResult
 model = None
 align_model_metadata = None
 def load_transcription_model(model_name: str = "turbo", device: str = None, compute_type: str = "float16"):
    global model, align_model_metadata
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Loading WhisperX model: {model_name} on {device} with {compute_type}")
    model = whisperx.load_model(model_name, device, compute_type=compute_type)
    # For alignment, load the metadata
    align_model_metadata = whisperx.alignment.DEFAULT_ALIGN_MODELS_HF
    print("Model loaded and ready.")
@asynccontextmanager
 async def lifespan(app: FastAPI):
    # Load the model at startup
    model_name = os.getenv("WHISPERX_MODEL", "turbo")
    device = os.getenv("WHISPERX_DEVICE", "cuda")
    compute_type = os.getenv("WHISPERX_COMPUTE_TYPE", "float16")
    load_transcription_model(model_name, device, compute_type)
    yield
    # Cleanup if needed
    print("Shutting down API")
 app = FastAPI(
    title="WhisperX API",
    description="OpenAI-compatible API for speech transcription using WhisperX",
    version="1.0.0",
    lifespan=lifespan
 )
@app.get("/")
 async def root():
    return {"message": "WhisperX API is running"}
@app.post("/v1/audio/transcriptions")
 async def transcribe_audio(
    file: UploadFile = File(...),
    model_name: str = Form("whisper-1"),  # OpenAI uses 'whisper-1', we ignore this
    language: str = Form(None),
    response_format: str = Form("json"),
    temperature: float = Form(0.0),  # We don't use temperature for now
    prompt: str = Form(None)  # Not used
 ):
    if model is None:
        raise HTTPException(status_code=500, detail="Model not loaded")
    if not file.filename.lower().endswith(('.wav', '.mp3', '.flac', '.m4a', '.webm', '.mp4', '.mpga', '.ogg', '.opus')):
        raise HTTPException(status_code=400, detail="Unsupported audio format")
    # Save uploaded file to temp file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        temp_file.write(await file.read())
        audio_path = temp_file.name
    try:
        # Load audio
        audio = whisperx.load_audio(audio_path)
        # Transcribe
        result = model(audio, batch_size=16, language=language)
        text = " ".join([segment['text'] for segment in result["segments"]]).strip()
        # If we have segments, might want to return more info, but for OpenAI compatibility, just text
        return JSONResponse({"text": text})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
    finally:
        os.unlink(audio_path)
--- a/whisperx/api/serve.py
+++ b/whisperx/api/serve.py
@@ -0,0 +1,16 @@
 import uvicorn
 def serve(host: str = "0.0.0.0", port: int = 8000, workers: int = 1):
    """Run the WhisperX API server"""
    uvicorn.run(
        "whisperx.api.main:app",
        host=host,
        port=port,
        workers=workers,
        reload=False  # No reload for production
    )
 if __name__ == "__main__":
    serve()