- Поменял всё снова на Whisper

- Добавил предзагрузку модели по умолчанию - Убрал метрики - Добавил скрипты для старта - Для отчаянных: Dockerfile для сборки контейнера на 70 ГБ
2025-08-20 23:18:02 +09:00
parent 4fd0f18dd1
commit 228f67d07f
9 changed files with 314 additions and 181 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,13 @@
 # Example environment for Simple ASR Server; copy to .env and adjust.
 # The paths below are container paths and match the defaults set in the
 # Dockerfile and docker-compose.yml.
 # Server configuration: bind address and TCP port passed to uvicorn
 HOST=0.0.0.0
 PORT=9854
 # Model configuration: Whisper model loaded at startup and the directory
 # used as the model download root
 DEFAULT_MODEL=turbo
 MODEL_DOWNLOAD_ROOT=/app/models
 # API Keys: file with one key per line. Kept under /app/data so that it
 # lives on the ./data volume mounted by docker-compose.yml instead of the
 # container's writable layer.
 KEYS_FILE=/app/data/keys.txt
 # Logging
 LOG_LEVEL=INFO
--- a/.idea/simple-asr-server.iml
+++ b/.idea/simple-asr-server.iml
@@ -0,0 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module version="4">
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
 </module>
--- a/42
+++ b/42
@@ -1,22 +1,46 @@
-FROM rocm/pytorch:rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0
+# Use ROCm compatible Python image as base
 FROM rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2
 # Set working directory
 WORKDIR /app
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
    ffmpeg \
    git \
    curl \
    python3-pip \
-    python3-venv \
+    && rm -rf /var/lib/apt/lists/*
 # Update pip
 RUN pip install --upgrade pip
 # Copy requirements first for better caching
 COPY requirements.txt .
 RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt
-COPY . .
+# Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application code
 COPY app.py .
 # Create directory for models and keys
 RUN mkdir -p /app/models /app/data
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
 ENV MODEL_DOWNLOAD_ROOT=/app/models
 ENV KEYS_FILE=/app/data/keys.txt
 ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
 ENV ROCM_PATH=/opt/rocm
 # Expose port
 EXPOSE 9854
-# Устанавливаем переменные окружения для ROCm
+# Health check
-ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
-ENV PYTORCH_ROCM_ARCH=gfx1030
+    CMD curl -f http://localhost:9854/health || exit 1
 # Run the application
 CMD ["python", "app.py"]
 # Команда для запуска приложения
 CMD ["python3", "app.py"]
--- a/README.md
+++ b/README.md
@@ -1,104 +1,86 @@
 BASED ON https://github.com/salute-developers/GigaAM
 # Simple ASR Server
-This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container.
+Простой сервер для автоматического распознавания речи (ASR) на базе OpenAI Whisper.
-## Prerequisites
+## Особенности
-Before you begin, ensure you have the following installed:
+- Поддержка различных моделей Whisper (tiny, base, small, medium, large, turbo)
 - Три формата вывода: plaintext, simple JSON, полный JSON
 - Параметр speedup для ускорения аудио перед распознаванием
 - Автоматическая конвертация аудио в поддерживаемый формат
 - API ключи для безопасности
 - Поддержка Docker
-*   [Docker](https://docs.docker.com/get-docker/)
+## Быстрый старт
 *   [Docker Compose](https://docs.docker.com/compose/install/)
-## Project Structure
+### Локальная установка
-```
+1. Установите зависимости:
-.
+```bash
-├── app.py              # Main application file with FastAPI endpoint
+pip install -r requirements.txt
 ├── docker-compose.yml  # Docker Compose configuration
 ├── Dockerfile          # Dockerfile for building the application image
 ├── model/              # Directory for Whisper model files
 └── requirements.txt    # Python dependencies
 ```
-## Setup
+2. Скопируйте и настройте переменные окружения:
 ```bash
 cp .env.example .env
 ```
-1.  **Clone the repository:**
+3. Запустите сервер:
 ```bash
 python app.py
 ```
-    ```bash
+### Docker
    git clone https://github.com/SlavaVlad/simple-asr-server
    cd simple-asr-server
    ```
 3.  **Add API keys:**
-    Create a `keys.txt` file in the root of the project and add your API keys, one per line.
+1. Соберите и запустите контейнер:
 ```bash
 docker-compose up --build
 ```
-## Building and Running the Project
+## API
 You can build and run the project using Docker Compose.
 1.  **Build the Docker image:**
    ```bash
    docker-compose build
    ```
 2.  **Run the container:**
    ```bash
    docker-compose up
    ```
    The application will be available at `http://0.0.0.0:9854`.
 ## API Endpoint
 ### POST /transcribe
-This endpoint accepts an audio file and returns the transcription.
+Распознавание речи из аудиофайла.
-*   **URL:** `/transcribe`
+**Параметры:**
-*   **Method:** `POST`
+- `file` (файл) - Аудиофайл для распознавания
-*   **Headers:**
+- `model_name` (опционально) - Модель Whisper для использования
-    *   `X-API-Key`: Your API key.
+- `output_format` - Формат вывода: `plaintext`, `simple` или `json`
-*   **Form Data:**
+- `speedup` - Коэффициент ускорения аудио (0.25-4.0)
    *   `file`: The audio file to be transcribed.
-**Example using `curl`:**
+**Заголовки:**
 - `x-api-key` - API ключ
 **Примеры:**
 ```bash
-curl -X POST "http://localhost:9854/transcribe" \
+# Простой текстовый вывод
-     -H "X-API-Key: YOUR_API_KEY" \
+curl -X POST "http://localhost:9854/transcribe?output_format=plaintext&speedup=1.5" \
-     -F "file=@/path/to/your/audio.wav"
+  -H "x-api-key: YOUR_API_KEY" \
  -F "file=@audio.wav"
 # JSON только с текстом
 curl -X POST "http://localhost:9854/transcribe?output_format=simple" \
  -H "x-api-key: YOUR_API_KEY" \
  -F "file=@audio.wav"
 # Полный JSON ответ с использованием другой модели
 curl -X POST "http://localhost:9854/transcribe?output_format=json&model_name=base" \
  -H "x-api-key: YOUR_API_KEY" \
  -F "file=@audio.wav"
 ```
-**Successful Response (200 OK):**
+### GET /health
-```json
+Проверка состояния сервера.
 {
  "transcription": [
    {
      "start_time": 0.0,
      "end_time": 2.5,
      "transcription": "Hello world."
    }
  ],
  "text": "Hello world. ",
  "metrics": {
    "processing_time": 5.2,
    "rtf": 0.5,
    "word_rate": 2.0
  }
 }
 ```
-**Error Response (401 Unauthorized):**
+## Переменные окружения
-If the API key is missing or invalid.
+См. `.env.example` для полного списка доступных переменных:
-```json
+- `HOST` - Хост сервера (по умолчанию: 0.0.0.0)
-{
+- `PORT` - Порт сервера (по умолчанию: 9854)
-  "detail": "Invalid API Key"
+- `DEFAULT_MODEL` - Модель по умолчанию (по умолчанию: turbo)
-}
+- `MODEL_DOWNLOAD_ROOT` - Папка для загрузки моделей
-```
+- `KEYS_FILE` - Файл с API ключами
--- a/app.py
+++ b/app.py
@@ -1,13 +1,14 @@
 import logging
 import os
 import subprocess
-import time
+import tempfile
-from os import getenv
+from typing import Optional
-from typing import Dict
+from enum import Enum
-import gigaam
+import whisper
-from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
+from fastapi import FastAPI, Depends, HTTPException, UploadFile, File, Query
 from fastapi.security import APIKeyHeader
 from fastapi.responses import PlainTextResponse
 # Configure logging
 logging.basicConfig(
@@ -16,14 +17,21 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
-app = FastAPI()
+app = FastAPI(title="Simple ASR Server", description="Audio transcription API using Whisper")
 # API key header
 api_key_header = APIKeyHeader(name="x-api-key")
 # Global model variable
 default_model = None
-def get_keys():  # не бейте меня за это
+class OutputFormat(str, Enum):
-    keys_file = "keys.txt"
+    plaintext = "plaintext"
    simple = "simple"
    json = "json"
 def get_keys():
    keys_file = os.getenv("KEYS_FILE", "keys.txt")
    if not os.path.exists(keys_file):
        # Create a new keys file with a default key
        default_key = os.urandom(32).hex()
@@ -36,16 +44,41 @@ def get_keys():  # не бейте меня за это
        with open(keys_file, "r") as f:
            keys = [line.strip() for line in f if line.strip()]
        logger.info(f"Loaded {len(keys)} keys from file")
        logger.debug(f"Keys: {keys}")
        if not keys:
            raise ValueError("No keys found in keys.txt")
        return keys
 def load_default_model():
    """Load the default Whisper model into the module-level ``default_model``.
    The model name is read from the ``DEFAULT_MODEL`` environment variable
    (falling back to ``"turbo"``) and the download directory from
    ``MODEL_DOWNLOAD_ROOT`` (falling back to ``None``).
    Raises:
        Exception: any error raised while loading is logged and re-raised
            unchanged.
    """
    global default_model
    model_name = os.getenv("DEFAULT_MODEL", "turbo")
    model_download_root = os.getenv("MODEL_DOWNLOAD_ROOT", None)
    logger.info(f"Loading default model: {model_name}")
    try:
        # in_memory=True is passed only here; get_model() loads other models without it.
        default_model = whisper.load_model(model_name, download_root=model_download_root, in_memory=True)
        logger.info(f"Successfully loaded model: {model_name}")
    except Exception as e:
        # Log for visibility, then propagate so the caller sees the failure.
        logger.error(f"Failed to load default model {model_name}: {e}")
        raise
 def get_model(model_name: Optional[str] = None):
    """Return the model to use for a transcription request.
    Args:
        model_name: Requested Whisper model name, or None for the default.
    Returns:
        The module-level ``default_model`` when ``model_name`` is None or
        equals the ``DEFAULT_MODEL`` environment variable (default
        ``"turbo"``); otherwise the result of a fresh ``whisper.load_model``
        call for ``model_name``.
    Note:
        ``default_model`` starts out as None at module level, so None is
        returned if ``load_default_model()`` has not populated it yet.
        Nothing in this function caches non-default models: each call that
        names one loads it again.
    """
    # Declared global but only read here; this function never reassigns it.
    global default_model
    if model_name is None:
        return default_model
    # If different model requested, load it
    if model_name != os.getenv("DEFAULT_MODEL", "turbo"):
        model_download_root = os.getenv("MODEL_DOWNLOAD_ROOT", None)
        logger.info(f"Loading requested model: {model_name}")
        return whisper.load_model(model_name, download_root=model_download_root)
    # The requested name matches the configured default: reuse the preloaded model.
    return default_model
 def convert_audio(input_path: str, output_path: str, speed: float = 1.0):
-    """
+    """Convert audio to compatible format and speed up if needed."""
    Convert audio to compatible format and speed up if needed.
    """
    try:
        command = [
            'ffmpeg', '-i', input_path,
@@ -57,97 +90,69 @@ def convert_audio(input_path: str, output_path: str, speed: float = 1.0):
            '-y'
        ]
        logger.debug(f"Running FFmpeg command: {' '.join(command)}")
-        subprocess.run(command, check=True, capture_output=True)
+        result = subprocess.run(command, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
-        logger.error(f"FFmpeg conversion failed: {e.stderr.decode()}")
+        logger.error(f"FFmpeg conversion failed: {e.stderr}")
        return False
 def get_audio_duration(file_path: str) -> float:
    """Return the duration ffprobe reports for ``file_path``.
    Runs ``ffprobe`` asking only for the container-level ``format=duration``
    entry and parses its stdout as a float.
    Args:
        file_path: Path of the media file to probe.
    Returns:
        The parsed duration (presumably in seconds, per ffprobe's output
        convention - confirm), or 0.0 if running ffprobe or parsing its
        output raises.
    """
    cmd = [
        'ffprobe',
        '-v', 'quiet',
        '-show_entries', 'format=duration',
        # Print the bare value only, without the section wrapper or the key name.
        '-of', 'default=noprint_wrappers=1:nokey=1',
        file_path
    ]
    try:
        output = subprocess.check_output(cmd).decode().strip()
        return float(output)
    # NOTE(review): a bare except also catches KeyboardInterrupt/SystemExit and
    # hides why probing failed; consider narrowing it and logging the error.
    except:
        return 0.0
@app.post("/transcribe")
 async def transcribe_audio(
-        file: UploadFile = File(...),
+    file: UploadFile = File(...),
-        token: str = Depends(api_key_header),
+    token: str = Depends(api_key_header),
-        model: str = "turbo",
+    model_name: Optional[str] = Query(None, description="Model name to use for transcription"),
-        verbose: Optional[bool] = None,
+    output_format: OutputFormat = Query(OutputFormat.json, description="Output format: plaintext, simple, or json"),
-        temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+    speedup: float = Query(1.0, ge=0.25, le=4.0, description="Speed up factor for audio (0.25-4.0)")
        compression_ratio_threshold: Optional[float] = 2.4,
        speed_up: Optional[float] = 1.25,
        logprob_threshold: Optional[float] = -1.0,
        no_speech_threshold: Optional[float] = 0.6,
        condition_on_previous_text: bool = True,
        initial_prompt: Optional[str] = None,
        word_timestamps: bool = False,
        prepend_punctuations: str = "\"'\"¿([{-",
        append_punctuations: str = "\"\'.。,，!！?？:：\")]}、",
        clip_timestamps: Union[str, List[float]] = "0",
        hallucination_silence_threshold: Optional[float] = None
 ):
    """Transcribe audio file with configurable output format"""
    # Token validation
    if token not in get_keys():
        logger.warning(f"Invalid token attempt: {token}")
        raise HTTPException(status_code=403, detail="Forbidden")
-    model = whisper.load_model(model)  # Load the Whisper model
+    logger.info(f"Processing file: {file.filename}, model: {model_name or 'default'}, format: {output_format}, speedup: {speedup}")
-    logger.info(f"Processing file: {file.filename} with model: {model}")
+    # Get model
    try:
        model = get_model(model_name)
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to load model: {str(e)}")
-    # Save uploaded file
+    # Create temporary files
-    temp_input_path = f"/tmp/input_{file.filename}"
+    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as temp_input:
-    temp_output_path = f"/tmp/converted_{file.filename}.wav"
+        temp_input_path = temp_input.name
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_output:
        temp_output_path = temp_output.name
    try:
        # Save uploaded file
        with open(temp_input_path, "wb") as f:
-            f.write(await file.read())
+            content = await file.read()
            f.write(content)
-        # Convert audio if needed
+        # Convert audio if speedup is not 1.0 or format needs conversion
-        logger.debug("Converting audio file")
+        if speedup != 1.0 or not file.filename.lower().endswith('.wav'):
-        if not convert_audio(temp_input_path, temp_output_path, speed_up):
+            logger.debug(f"Converting audio file with speedup: {speedup}")
-            raise HTTPException(status_code=400, detail="Audio conversion failed")
+            if not convert_audio(temp_input_path, temp_output_path, speedup):
-
+                raise HTTPException(status_code=400, detail="Audio conversion failed")
-        # Get audio duration before speed up
+            audio_file_path = temp_output_path
-        original_duration = get_audio_duration(temp_input_path)
+        else:
            audio_file_path = temp_input_path
        # Transcribe
        logger.info("Starting transcription")
-        if original_duration > 30:
+        result = model.transcribe(audio_file_path)
            logger.info("Audio duration > 30 seconds, using transcribe_longform")
            transcription_result = model.transcribe_longform(
                temp_output_path
            )
        else:
            logger.info("Audio duration <= 30 seconds, using transcribe")
            transcription_result = model.transcribe(
                temp_output_path
            )
-        full_text = ""
+        # Format output based on requested format
-        for part in transcription_result:
+        if output_format == OutputFormat.plaintext:
-            if part["transcription"].strip() != "":
+            return PlainTextResponse(content=result["text"], media_type="text/plain")
-                full_text += part["transcription"].strip() + " "
+        elif output_format == OutputFormat.simple:
-
+            return {"text": result["text"]}
-        result = {
+        else:  # json format
-            "transcription": transcription_result,
+            return result
            "text": full_text
        }
        return result
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
@@ -155,16 +160,29 @@ async def transcribe_audio(
    finally:
        # Cleanup temporary files
-        if os.path.exists(temp_input_path):
+        for path in [temp_input_path, temp_output_path]:
-            os.remove(temp_input_path)
+            if os.path.exists(path):
-        if os.path.exists(temp_output_path):
+                try:
-            os.remove(temp_output_path)
+                    os.remove(path)
                except Exception as e:
                    logger.warning(f"Failed to remove temp file {path}: {e}")
@app.get("/health")
 async def health_check():
    """Health check endpoint.
    Returns:
        dict: ``{"status": "healthy", "model_loaded": <bool>}``. ``status`` is
        always ``"healthy"``; ``model_loaded`` is True when the module-level
        ``default_model`` is not None.
    """
    return {"status": "healthy", "model_loaded": default_model is not None}
 def main():
    import uvicorn
    # Load default model and keys
    load_default_model()
    get_keys()
-    uvicorn.run(app, host="0.0.0.0", port=9854, log_level="debug")
+
    port = int(os.getenv("PORT", 9854))
    host = os.getenv("HOST", "0.0.0.0")
    uvicorn.run(app, host=host, port=port, log_level="info")
 if __name__ == "__main__":
    main()
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,6 +1,31 @@
 services:
-  whisper-app:
+  simple-asr-server:
    build: .
    ports:
-      - "9854:9854"
+      - "${PORT:-9854}:9854"
-    command: ["python", "app.py"]
+    environment:
      - HOST=${HOST:-0.0.0.0}
      - PORT=${PORT:-9854}
      - DEFAULT_MODEL=${DEFAULT_MODEL:-turbo}
      - MODEL_DOWNLOAD_ROOT=${MODEL_DOWNLOAD_ROOT:-/app/models}
      - KEYS_FILE=${KEYS_FILE:-/app/data/keys.txt}
      - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-10.3.0}
    volumes:
      - ./models:/app/models
      - ./data:/app/data
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    group_add:
      - video
      - render
    security_opt:
      - seccomp:unconfined
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9854/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,5 @@
 fastapi
 uvicorn[standard]
 python-multipart
-gigaam
+openai-whisper
-gigaam[longform]
+python-dotenv
 ffmpeg-python
 PyYAML
 numpy<2.0.0
--- a/simple-asr-server.service
+++ b/simple-asr-server.service
@@ -0,0 +1,20 @@
 # systemd unit that runs the ASR server through /opt/asr/start_server.sh.
 [Unit]
 Description=Whisper ASR Server (ROCM)
 # Start after network.target and pull it in as a weak dependency.
 After=network.target
 Wants=network.target
 [Service]
 Type=exec
 # Run as user/group "asr" with /opt/asr as the working directory.
 User=asr
 Group=asr
 WorkingDirectory=/opt/asr
 ExecStart=/opt/asr/start_server.sh
 # NOTE(review): reload sends SIGHUP to the main process; confirm the server
 # handles SIGHUP, otherwise a reload may simply terminate it.
 ExecReload=/bin/kill -HUP $MAINPID
 # Restart on every exit, waiting 10 seconds between attempts.
 Restart=always
 RestartSec=10
 # Send stdout and stderr to the journal under the identifier "asr".
 StandardOutput=journal
 StandardError=journal
 SyslogIdentifier=asr
 [Install]
 WantedBy=multi-user.target
--- a/start_server.sh
+++ b/start_server.sh
@@ -0,0 +1,47 @@
 #!/bin/bash
 # Launcher for the Simple ASR Server, intended to be started by systemd.
 # Reads optional settings from a .env file next to this script, fills in
 # defaults for anything left unset or empty, then replaces itself with app.py.
 set -e
 # Resolve the directory holding this script; the app is run from there.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 APP_DIR="${SCRIPT_DIR}"
 # Pull in .env when present; while `set -a` is active every variable the
 # file assigns is exported automatically.
 if [[ ! -f "${APP_DIR}/.env" ]]; then
   echo "Warning: .env file not found at ${APP_DIR}/.env"
   echo "Using default environment variables"
 else
   echo "Loading environment variables from ${APP_DIR}/.env"
   set -a
   . "${APP_DIR}/.env"
   set +a
 fi
 # Fall back to defaults for anything still unset or empty, then export the lot.
 : "${HOST:=0.0.0.0}"
 : "${PORT:=9854}"
 : "${DEFAULT_MODEL:=turbo}"
 : "${MODEL_DOWNLOAD_ROOT:=${APP_DIR}/models}"
 : "${KEYS_FILE:=${APP_DIR}/keys.txt}"
 : "${LOG_LEVEL:=INFO}"
 export HOST PORT DEFAULT_MODEL MODEL_DOWNLOAD_ROOT KEYS_FILE LOG_LEVEL
 # Make sure the model directory and the keys file's parent directory exist.
 mkdir -p "${MODEL_DOWNLOAD_ROOT}"
 mkdir -p "$(dirname "${KEYS_FILE}")"
 # Run from the app directory so the relative app.py path resolves.
 cd "${APP_DIR}"
 # Startup banner with the effective configuration, one item per line.
 printf '%s\n' \
   "Starting Simple ASR Server..." \
   "Host: ${HOST}" \
   "Port: ${PORT}" \
   "Default Model: ${DEFAULT_MODEL}" \
   "Model Download Root: ${MODEL_DOWNLOAD_ROOT}" \
   "Keys File: ${KEYS_FILE}" \
   "Log Level: ${LOG_LEVEL}"
 # exec replaces this shell, so the Python server becomes the service's main process.
 exec python3 app.py