- Поменял всё снова на Whisper

- Добавил предзагрузку модели по умолчанию - Убрал метрики - Добавил скрипты для старта - Для отчаянных Dockerfile для сборки контейнера на 70 ГБ
2025-08-20 23:18:02 +09:00
parent 4fd0f18dd1
commit 228f67d07f
9 changed files with 314 additions and 181 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,13 @@
+# Server configuration
+HOST=0.0.0.0
+PORT=9854
+
+# Model configuration
+DEFAULT_MODEL=turbo
+MODEL_DOWNLOAD_ROOT=/app/models
+
+# API keys file (same path as the Dockerfile/compose default, on the ./data volume)
+KEYS_FILE=/app/data/keys.txt
+
+# Logging
+LOG_LEVEL=INFO
--- a/.idea/simple-asr-server.iml
+++ b/.idea/simple-asr-server.iml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module version="4">
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>
--- a/42
+++ b/42
@@ -1,22 +1,46 @@
-FROM rocm/pytorch:rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0
+# Use ROCm compatible Python image as base
+FROM rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2

+# Set working directory
 WORKDIR /app

+# Install system dependencies
 RUN apt-get update && apt-get install -y \
    ffmpeg \
+    git \
+    curl \
    python3-pip \
-    python3-venv \
+    && rm -rf /var/lib/apt/lists/*

+# Update pip
+RUN pip install --upgrade pip
+
+# Copy requirements first for better caching
 COPY requirements.txt .
-RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt

-COPY . .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt

+# Copy application code
+COPY app.py .
+
+# Create directory for models and keys
+RUN mkdir -p /app/models /app/data
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV MODEL_DOWNLOAD_ROOT=/app/models
+ENV KEYS_FILE=/app/data/keys.txt
+ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
+ENV ROCM_PATH=/opt/rocm
+
+# Expose port
 EXPOSE 9854

-# Устанавливаем переменные окружения для ROCm
-ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
-ENV PYTORCH_ROCM_ARCH=gfx1030
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:9854/health || exit 1
+
+# Run the application
+CMD ["python", "app.py"]

-# Команда для запуска приложения
-CMD ["python3", "app.py"]
--- a/README.md
+++ b/README.md
@@ -1,104 +1,86 @@
-BASED ON https://github.com/salute-developers/GigaAM
-
 # Simple ASR Server

-This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container.
+Простой сервер для автоматического распознавания речи (ASR) на базе OpenAI Whisper.

-## Prerequisites
+## Особенности

-Before you begin, ensure you have the following installed:
+- Поддержка различных моделей Whisper (tiny, base, small, medium, large, turbo)
+- Три формата вывода: plaintext, simple JSON, полный JSON
+- Параметр speedup для ускорения аудио перед распознаванием
+- Автоматическая конвертация аудио в поддерживаемый формат
+- API ключи для безопасности
+- Docker поддержка

-*   [Docker](https://docs.docker.com/get-docker/)
-*   [Docker Compose](https://docs.docker.com/compose/install/)
+## Быстрый старт

-## Project Structure
-
-```
-.
-├── app.py              # Main application file with FastAPI endpoint
-├── docker-compose.yml  # Docker Compose configuration
-├── Dockerfile          # Dockerfile for building the application image
-├── model/              # Directory for Whisper model files
-└── requirements.txt    # Python dependencies
-```
-
-## Setup
-
-1.  **Clone the repository:**
+### Локальная установка

+1. Установите зависимости:
 ```bash
-    git clone https://github.com/SlavaVlad/simple-asr-server
-    cd simple-asr-server
+pip install -r requirements.txt
 ```
-3.  **Add API keys:**
-
-    Create a `keys.txt` file in the root of the project and add your API keys, one per line.
-
-## Building and Running the Project
-
-You can build and run the project using Docker Compose.
-
-1.  **Build the Docker image:**

+2. Скопируйте и настройте переменные окружения:
 ```bash
-    docker-compose build
+cp .env.example .env
 ```

-2.  **Run the container:**
-
+3. Запустите сервер:
 ```bash
-    docker-compose up
+python app.py
 ```

-    The application will be available at `http://0.0.0.0:9854`.
+### Docker

-## API Endpoint
+1. Соберите и запустите контейнер:
+```bash
+docker-compose up --build
+```
+
+## API

 ### POST /transcribe

-This endpoint accepts an audio file and returns the transcription.
+Распознавание речи из аудиофайла.

-*   **URL:** `/transcribe`
-*   **Method:** `POST`
-*   **Headers:**
-    *   `X-API-Key`: Your API key.
-*   **Form Data:**
-    *   `file`: The audio file to be transcribed.
+**Параметры:**
+- `file` (файл) - Аудиофайл для распознавания
+- `model_name` (опционально) - Модель Whisper для использования
+- `output_format` - Формат вывода: `plaintext`, `simple`, или `json`
+- `speedup` - Коэффициент ускорения аудио (0.25-4.0)

-**Example using `curl`:**
+**Заголовки:**
+- `x-api-key` - API ключ
+
+**Примеры:**

 ```bash
-curl -X POST "http://localhost:9854/transcribe" \
-     -H "X-API-Key: YOUR_API_KEY" \
-     -F "file=@/path/to/your/audio.wav"
+# Простой текстовый вывод
+curl -X POST "http://localhost:9854/transcribe?output_format=plaintext&speedup=1.5" \
+  -H "x-api-key: YOUR_API_KEY" \
+  -F "file=@audio.wav"
+
+# JSON с только текстом
+curl -X POST "http://localhost:9854/transcribe?output_format=simple" \
+  -H "x-api-key: YOUR_API_KEY" \
+  -F "file=@audio.wav"
+
+# Полный JSON ответ с использованием другой модели
+curl -X POST "http://localhost:9854/transcribe?output_format=json&model_name=base" \
+  -H "x-api-key: YOUR_API_KEY" \
+  -F "file=@audio.wav"
 ```

-**Successful Response (200 OK):**
+### GET /health

-```json
-{
-  "transcription": [
-    {
-      "start_time": 0.0,
-      "end_time": 2.5,
-      "transcription": "Hello world."
-    }
-  ],
-  "text": "Hello world. ",
-  "metrics": {
-    "processing_time": 5.2,
-    "rtf": 0.5,
-    "word_rate": 2.0
-  }
-}
-```
+Проверка состояния сервера.

-**Error Response (401 Unauthorized):**
+## Переменные окружения

-If the API key is missing or invalid.
+См. `.env.example` для полного списка доступных переменных:

-```json
-{
-  "detail": "Invalid API Key"
-}
-```
+- `HOST` - Хост сервера (по умолчанию: 0.0.0.0)
+- `PORT` - Порт сервера (по умолчанию: 9854)
+- `DEFAULT_MODEL` - Модель по умолчанию (по умолчанию: turbo)
+- `MODEL_DOWNLOAD_ROOT` - Папка для загрузки моделей
+- `KEYS_FILE` - Файл с API ключами
--- a/app.py
+++ b/app.py
@@ -1,13 +1,14 @@
 import logging
 import os
 import subprocess
-import time
-from os import getenv
-from typing import Dict
+import tempfile
+from typing import Optional
+from enum import Enum

-import gigaam
-from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
+import whisper
+from fastapi import FastAPI, Depends, HTTPException, UploadFile, File, Query
 from fastapi.security import APIKeyHeader
+from fastapi.responses import PlainTextResponse

 # Configure logging
 logging.basicConfig(
@@ -16,14 +17,21 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-app = FastAPI()
+app = FastAPI(title="Simple ASR Server", description="Audio transcription API using Whisper")

 # API key header
 api_key_header = APIKeyHeader(name="x-api-key")

+# Global model variable
+default_model = None

-def get_keys():  # не бейте меня за это
-    keys_file = "keys.txt"
+class OutputFormat(str, Enum):  # response shapes selectable via the output_format query parameter
+    plaintext = "plaintext"  # transcript text only, returned as text/plain
+    simple = "simple"  # JSON object containing just the "text" field
+    json = "json"  # the full result returned by model.transcribe
+
+def get_keys():
+    keys_file = os.getenv("KEYS_FILE", "keys.txt")
    if not os.path.exists(keys_file):
        # Create a new keys file with a default key
        default_key = os.urandom(32).hex()
@@ -36,16 +44,41 @@ def get_keys():  # не бейте меня за это
        with open(keys_file, "r") as f:
            keys = [line.strip() for line in f if line.strip()]
        logger.info(f"Loaded {len(keys)} keys from file")
-        logger.debug(f"Keys: {keys}")
        if not keys:
            raise ValueError("No keys found in keys.txt")
        return keys

+def load_default_model():
+    """Load the default model on startup"""
+    global default_model
+    model_name = os.getenv("DEFAULT_MODEL", "turbo")
+    model_download_root = os.getenv("MODEL_DOWNLOAD_ROOT", None)
+
+    logger.info(f"Loading default model: {model_name}")
+    try:
+        default_model = whisper.load_model(model_name, download_root=model_download_root, in_memory=True)
+        logger.info(f"Successfully loaded model: {model_name}")
+    except Exception as e:
+        logger.error(f"Failed to load default model {model_name}: {e}")
+        raise
+
+def get_model(model_name: Optional[str] = None):
+    """Get model - either default or load new one if specified"""
+    global default_model
+
+    if model_name is None:
+        return default_model
+
+    # If different model requested, load it
+    if model_name != os.getenv("DEFAULT_MODEL", "turbo"):
+        model_download_root = os.getenv("MODEL_DOWNLOAD_ROOT", None)
+        logger.info(f"Loading requested model: {model_name}")
+        return whisper.load_model(model_name, download_root=model_download_root)
+
+    return default_model

 def convert_audio(input_path: str, output_path: str, speed: float = 1.0):
-    """
-    Convert audio to compatible format and speed up if needed.
-    """
+    """Convert audio to compatible format and speed up if needed."""
    try:
        command = [
            'ffmpeg', '-i', input_path,
@@ -57,96 +90,68 @@ def convert_audio(input_path: str, output_path: str, speed: float = 1.0):
            '-y'
        ]
        logger.debug(f"Running FFmpeg command: {' '.join(command)}")
-        subprocess.run(command, check=True, capture_output=True)
+        result = subprocess.run(command, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
-        logger.error(f"FFmpeg conversion failed: {e.stderr.decode()}")
+        logger.error(f"FFmpeg conversion failed: {e.stderr}")
        return False

-
-def get_audio_duration(file_path: str) -> float:
-    """Get audio duration using ffprobe"""
-    cmd = [
-        'ffprobe',
-        '-v', 'quiet',
-        '-show_entries', 'format=duration',
-        '-of', 'default=noprint_wrappers=1:nokey=1',
-        file_path
-    ]
-    try:
-        output = subprocess.check_output(cmd).decode().strip()
-        return float(output)
-    except:
-        return 0.0
-
-
@app.post("/transcribe")
 async def transcribe_audio(
    file: UploadFile = File(...),
    token: str = Depends(api_key_header),
-        model: str = "turbo",
-        verbose: Optional[bool] = None,
-        temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
-        compression_ratio_threshold: Optional[float] = 2.4,
-        speed_up: Optional[float] = 1.25,
-        logprob_threshold: Optional[float] = -1.0,
-        no_speech_threshold: Optional[float] = 0.6,
-        condition_on_previous_text: bool = True,
-        initial_prompt: Optional[str] = None,
-        word_timestamps: bool = False,
-        prepend_punctuations: str = "\"'\"¿([{-",
-        append_punctuations: str = "\"\'.。,，!！?？:：\")]}、",
-        clip_timestamps: Union[str, List[float]] = "0",
-        hallucination_silence_threshold: Optional[float] = None
+    model_name: Optional[str] = Query(None, description="Model name to use for transcription"),
+    output_format: OutputFormat = Query(OutputFormat.json, description="Output format: plaintext, simple, or json"),
+    speedup: float = Query(1.0, ge=0.25, le=4.0, description="Speed up factor for audio (0.25-4.0)")
 ):
+    """Transcribe audio file with configurable output format"""
+
    # Token validation
    if token not in get_keys():
        logger.warning(f"Invalid token attempt: {token}")
        raise HTTPException(status_code=403, detail="Forbidden")

-    model = whisper.load_model(model)  # Load the Whisper model
+    logger.info(f"Processing file: {file.filename}, model: {model_name or 'default'}, format: {output_format}, speedup: {speedup}")

-    logger.info(f"Processing file: {file.filename} with model: {model}")
+    # Get model
+    try:
+        model = get_model(model_name)
+    except Exception as e:
+        logger.error(f"Failed to load model: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to load model: {str(e)}")

-    # Save uploaded file
-    temp_input_path = f"/tmp/input_{file.filename}"
-    temp_output_path = f"/tmp/converted_{file.filename}.wav"
+    # Create temporary files
+    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as temp_input:
+        temp_input_path = temp_input.name
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_output:
+        temp_output_path = temp_output.name

    try:
+        # Save uploaded file
        with open(temp_input_path, "wb") as f:
-            f.write(await file.read())
+            content = await file.read()
+            f.write(content)

-        # Convert audio if needed
-        logger.debug("Converting audio file")
-        if not convert_audio(temp_input_path, temp_output_path, speed_up):
+        # Convert audio if speedup is not 1.0 or format needs conversion
+        if speedup != 1.0 or not file.filename.lower().endswith('.wav'):
+            logger.debug(f"Converting audio file with speedup: {speedup}")
+            if not convert_audio(temp_input_path, temp_output_path, speedup):
                raise HTTPException(status_code=400, detail="Audio conversion failed")
-
-        # Get audio duration before speed up
-        original_duration = get_audio_duration(temp_input_path)
+            audio_file_path = temp_output_path
+        else:
+            audio_file_path = temp_input_path

        # Transcribe
        logger.info("Starting transcription")
-        if original_duration > 30:
-            logger.info("Audio duration > 30 seconds, using transcribe_longform")
-            transcription_result = model.transcribe_longform(
-                temp_output_path
-            )
-        else:
-            logger.info("Audio duration <= 30 seconds, using transcribe")
-            transcription_result = model.transcribe(
-                temp_output_path
-            )
-
-        full_text = ""
-        for part in transcription_result:
-            if part["transcription"].strip() != "":
-                full_text += part["transcription"].strip() + " "
-
-        result = {
-            "transcription": transcription_result,
-            "text": full_text
-        }
+        result = model.transcribe(audio_file_path)

+        # Format output based on requested format
+        if output_format == OutputFormat.plaintext:
+            return PlainTextResponse(content=result["text"], media_type="text/plain")
+        elif output_format == OutputFormat.simple:
+            return {"text": result["text"]}
+        else:  # json format
            return result

    except Exception as e:
@@ -155,16 +160,29 @@ async def transcribe_audio(

    finally:
        # Cleanup temporary files
-        if os.path.exists(temp_input_path):
-            os.remove(temp_input_path)
-        if os.path.exists(temp_output_path):
-            os.remove(temp_output_path)
+        for path in [temp_input_path, temp_output_path]:
+            if os.path.exists(path):
+                try:
+                    os.remove(path)
+                except Exception as e:
+                    logger.warning(f"Failed to remove temp file {path}: {e}")

+@app.get("/health")
+async def health_check():
+    """Return a fixed "healthy" status plus whether the global default_model has been set."""
+    return {"status": "healthy", "model_loaded": default_model is not None}

 def main():
    import uvicorn
+
+    # Load default model and keys
+    load_default_model()
    get_keys()
-    uvicorn.run(app, host="0.0.0.0", port=9854, log_level="debug")
+
+    port = int(os.getenv("PORT", 9854))
+    host = os.getenv("HOST", "0.0.0.0")
+
+    uvicorn.run(app, host=host, port=port, log_level="info")

 if __name__ == "__main__":
    main()
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,6 +1,31 @@
 services:
-  whisper-app:
+  simple-asr-server:
    build: .
    ports:
-      - "9854:9854"
-    command: ["python", "app.py"]
+      - "${PORT:-9854}:9854"  # PORT selects the host port; the container side stays 9854
+    environment:
+      - HOST=${HOST:-0.0.0.0}
+      - PORT=9854  # pinned: the port mapping and healthcheck both target 9854 in the container
+      - DEFAULT_MODEL=${DEFAULT_MODEL:-turbo}
+      - MODEL_DOWNLOAD_ROOT=${MODEL_DOWNLOAD_ROOT:-/app/models}
+      - KEYS_FILE=${KEYS_FILE:-/app/data/keys.txt}
+      - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-10.3.0}
+    volumes:
+      - ./models:/app/models
+      - ./data:/app/data
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    group_add:
+      - video
+      - render
+    security_opt:
+      - seccomp:unconfined
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9854/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,5 @@
 fastapi
 uvicorn[standard]
 python-multipart
-gigaam
-gigaam[longform]
-ffmpeg-python
-PyYAML
-numpy<2.0.0
+openai-whisper
+python-dotenv
--- a/simple-asr-server.service
+++ b/simple-asr-server.service
@@ -0,0 +1,20 @@
+[Unit]
+Description=Whisper ASR Server (ROCM)
+After=network.target
+Wants=network.target
+
+[Service]
+Type=exec
+User=asr
+Group=asr
+WorkingDirectory=/opt/asr
+ExecStart=/opt/asr/start_server.sh
+ExecReload=/bin/kill -HUP $MAINPID
+Restart=always
+RestartSec=10
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=asr
+
+[Install]
+WantedBy=multi-user.target
--- a/start_server.sh
+++ b/start_server.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Simple ASR Server startup script for systemd.
+# Loads variables from an optional .env next to this script, applies defaults, then execs app.py.
+
+set -euo pipefail
+
+# Directory holding this script; app.py and .env are expected alongside it.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+APP_DIR="${SCRIPT_DIR}"
+
+# Load environment variables from .env file if it exists
+if [[ -f "${APP_DIR}/.env" ]]; then
+    echo "Loading environment variables from ${APP_DIR}/.env"
+    set -a  # automatically export all variables
+    source "${APP_DIR}/.env"
+    set +a
+else
+    echo "Warning: .env file not found at ${APP_DIR}/.env" >&2
+    echo "Using default environment variables" >&2
+fi
+
+# Set default values if not provided in .env
+export HOST="${HOST:-0.0.0.0}"
+export PORT="${PORT:-9854}"
+export DEFAULT_MODEL="${DEFAULT_MODEL:-turbo}"
+export MODEL_DOWNLOAD_ROOT="${MODEL_DOWNLOAD_ROOT:-${APP_DIR}/models}"
+export KEYS_FILE="${KEYS_FILE:-${APP_DIR}/keys.txt}"
+export LOG_LEVEL="${LOG_LEVEL:-INFO}"
+
+# Create necessary directories
+mkdir -p -- "${MODEL_DOWNLOAD_ROOT}"
+mkdir -p -- "$(dirname -- "${KEYS_FILE}")"
+
+# Change to app directory
+cd -- "${APP_DIR}"
+
+echo "Starting Simple ASR Server..."
+echo "Host: ${HOST}"
+echo "Port: ${PORT}"
+echo "Default Model: ${DEFAULT_MODEL}"
+echo "Model Download Root: ${MODEL_DOWNLOAD_ROOT}"
+echo "Keys File: ${KEYS_FILE}"
+echo "Log Level: ${LOG_LEVEL}"
+
+# Start the application
+exec python3 app.py