From 5b0d04a2405d58d484d19ad2856613c60d124dfb Mon Sep 17 00:00:00 2001
From: vladislav <nauka.2.0.vs@gmail.com>
Date: Tue, 15 Jul 2025 17:30:37 +0300
Subject: [PATCH 1/5] fix: model load on request and keep in mem rename:
 model_name to model

---
 app.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/app.py b/app.py
index cb47a23..b902c31 100644
--- a/app.py
+++ b/app.py
@@ -18,7 +18,6 @@ logger = logging.getLogger(__name__)
 
 app = FastAPI()
 
-
 # API key header
 api_key_header = APIKeyHeader(name="x-api-key")
 
@@ -108,7 +107,7 @@ def get_audio_duration(file_path: str) -> float:
 async def transcribe_audio(
         file: UploadFile = File(...),
         token: str = Depends(api_key_header),
-        model_name: str = "turbo",
+        model: str = "turbo",
         verbose: Optional[bool] = None,
         temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
         compression_ratio_threshold: Optional[float] = 2.4,
@@ -127,7 +126,9 @@ async def transcribe_audio(
         logger.warning(f"Invalid token attempt: {token}")
         raise HTTPException(status_code=403, detail="Forbidden")
 
-    logger.info(f"Processing file: {file.filename} with model: {model_name}")
+    model = whisper.load_model(model)  # Load the Whisper model
+
+    logger.info(f"Processing file: {file.filename} with model: {model}")
     metrics = TranscriptionMetrics()
 
     # Save uploaded file

From 9eb026b220595aa7423fff2c97d44b580b218329 Mon Sep 17 00:00:00 2001
From: red <red@itmo.ru>
Date: Sun, 17 Aug 2025 23:24:24 +0900
Subject: [PATCH 2/5] - deleted metrics

---
 app.py | 38 ++++----------------------------------
 1 file changed, 4 insertions(+), 34 deletions(-)

diff --git a/app.py b/app.py
index b902c31..f5cf989 100644
--- a/app.py
+++ b/app.py
@@ -42,9 +42,9 @@ def get_keys():  # РЅРµ Р±РµР№С‚Рµ РјРµРЅСЏ Р·Р° СЌС‚Рѕ
         return keys
 
 
-def convert_audio(input_path: str, output_path: str, speed: float = 1.25):
+def convert_audio(input_path: str, output_path: str, speed: float = 1.0):
     """
-    Convert audio to compatible format and speed up
+    Convert audio to compatible format and speed up if needed.
     """
     try:
         command = [
@@ -64,29 +64,6 @@ def convert_audio(input_path: str, output_path: str, speed: float = 1.25):
         return False
 
 
-class TranscriptionMetrics:
-    def __init__(self):
-        self.start_time = time.time()
-        self.end_time = None
-        self.text_length = 0
-        self.audio_duration = 0
-
-    def stop(self, text: str, audio_duration: float):
-        self.end_time = time.time()
-        self.text_length = len(text)
-        self.audio_duration = audio_duration
-
-    def get_metrics(self) -> Dict[str, float]:
-        processing_time = self.end_time - self.start_time
-        return {
-            "processing_time_seconds": round(processing_time, 2),
-            "characters_per_second": round(self.text_length / processing_time, 2),
-            "audio_realtime_ratio": round(self.audio_duration / processing_time, 2),
-            "audio_duration": round(self.audio_duration, 2),
-            "text_length": self.text_length
-        }
-
-
 def get_audio_duration(file_path: str) -> float:
     """Get audio duration using ffprobe"""
     cmd = [
@@ -111,6 +88,7 @@ async def transcribe_audio(
         verbose: Optional[bool] = None,
         temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
         compression_ratio_threshold: Optional[float] = 2.4,
+        speed_up: Optional[float] = 1.25,
         logprob_threshold: Optional[float] = -1.0,
         no_speech_threshold: Optional[float] = 0.6,
         condition_on_previous_text: bool = True,
@@ -129,7 +107,6 @@ async def transcribe_audio(
     model = whisper.load_model(model)  # Load the Whisper model
 
     logger.info(f"Processing file: {file.filename} with model: {model}")
-    metrics = TranscriptionMetrics()
 
     # Save uploaded file
     temp_input_path = f"/tmp/input_{file.filename}"
@@ -141,7 +118,7 @@ async def transcribe_audio(
 
         # Convert audio if needed
         logger.debug("Converting audio file")
-        if not convert_audio(temp_input_path, temp_output_path):
+        if not convert_audio(temp_input_path, temp_output_path, speed_up):
             raise HTTPException(status_code=400, detail="Audio conversion failed")
 
         # Get audio duration before speed up
@@ -165,13 +142,6 @@ async def transcribe_audio(
             hallucination_silence_threshold=hallucination_silence_threshold
         )
 
-        # Calculate metrics
-        metrics.stop(result["text"], original_duration)
-        logger.info(f"Transcription metrics: {metrics.get_metrics()}")
-
-        # Add metrics to result
-        result["metrics"] = metrics.get_metrics()
-
         return result
 
     except Exception as e:

From 228f67d07ffccd589da1ec8f933e7fc106fc2494 Mon Sep 17 00:00:00 2001
From: red <red@itmo.ru>
Date: Wed, 20 Aug 2025 23:18:02 +0900
Subject: [PATCH 3/5] =?UTF-8?q?-=20=D0=9F=D0=BE=D0=BC=D0=B5=D0=BD=D1=8F?=
 =?UTF-8?q?=D0=BB=20=D0=B2=D1=81=D1=91=20=D1=81=D0=BD=D0=BE=D0=B2=D0=B0=20?=
 =?UTF-8?q?=D0=BD=D0=B0=20Whisper=20-=20=D0=94=D0=BE=D0=B1=D0=B0=D0=B2?=
 =?UTF-8?q?=D0=B8=D0=BB=20=D0=BF=D1=80=D0=B5=D0=B4=D0=B7=D0=B0=D0=B3=D1=80?=
 =?UTF-8?q?=D1=83=D0=B7=D0=BA=D1=83=20=D0=BC=D0=BE=D0=B4=D0=B5=D0=BB=D0=B8?=
 =?UTF-8?q?=20=D0=BF=D0=BE-=D1=83=D0=BC=D0=BE=D0=BB=D1=87=D0=B0=D0=BD?=
 =?UTF-8?q?=D0=B8=D1=8E=20-=20=D0=A3=D0=B1=D1=80=D0=B0=D0=BB=20=D0=BC?=
 =?UTF-8?q?=D0=B5=D1=82=D1=80=D0=B8=D0=BA=D0=B8=20-=20=D0=94=D0=BE=D0=B1?=
 =?UTF-8?q?=D0=B0=D0=B2=D0=B8=D0=BB=20=D1=81=D0=BA=D1=80=D0=B8=D0=BF=D1=82?=
 =?UTF-8?q?=D1=8B=20=D0=B4=D0=BB=D1=8F=20=D1=81=D1=82=D0=B0=D1=80=D1=82?=
 =?UTF-8?q?=D0=B0=20-=20=D0=94=D0=BB=D1=8F=20=D0=BE=D1=82=D1=87=D0=B0?=
 =?UTF-8?q?=D1=8F=D0=BD=D0=BD=D1=8B=D1=85=20Dockerfile=20=D0=B4=D0=BB?=
 =?UTF-8?q?=D1=8F=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B8=20=D0=BA=D0=BE?=
 =?UTF-8?q?=D0=BD=D1=82=D0=B5=D0=B9=D0=BD=D0=B5=D1=80=D0=B0=20=D0=BD=D0=B0?=
 =?UTF-8?q?=2070=D0=93=D0=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .env.example                |  13 +++
 .idea/simple-asr-server.iml |   7 ++
 Dockerfile                  |  42 ++++++--
 README.md                   | 138 ++++++++++++--------------
 app.py                      | 190 ++++++++++++++++++++----------------
 docker-compose.yml          |  31 +++++-
 requirements.txt            |   7 +-
 simple-asr-server.service   |  20 ++++
 start_server.sh             |  47 +++++++++
 9 files changed, 314 insertions(+), 181 deletions(-)
 create mode 100644 .env.example
 create mode 100644 .idea/simple-asr-server.iml
 create mode 100644 simple-asr-server.service
 create mode 100644 start_server.sh

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..73fcce6
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,13 @@
+# Server configuration
+HOST=0.0.0.0
+PORT=9854
+
+# Model configuration
+DEFAULT_MODEL=turbo
+MODEL_DOWNLOAD_ROOT=/app/models
+
+# API Keys (kept under /app/data so the docker-compose ./data volume persists them)
+KEYS_FILE=/app/data/keys.txt
+
+# Logging
+LOG_LEVEL=INFO
diff --git a/.idea/simple-asr-server.iml b/.idea/simple-asr-server.iml
new file mode 100644
index 0000000..ec63674
--- /dev/null
+++ b/.idea/simple-asr-server.iml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module version="4">
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 2f7749a..57398d0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,22 +1,46 @@
-FROM rocm/pytorch:rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0
+# Use ROCm compatible Python image as base
+FROM rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2
 
+# Set working directory
 WORKDIR /app
 
+# Install system dependencies
 RUN apt-get update && apt-get install -y \
     ffmpeg \
+    git \
+    curl \
     python3-pip \
-    python3-venv \
+    && rm -rf /var/lib/apt/lists/*
 
+# Update pip
+RUN pip install --upgrade pip
+
+# Copy requirements first for better caching
 COPY requirements.txt .
-RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt
 
-COPY . .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
 
+# Copy application code
+COPY app.py .
+
+# Create directories for models and key data
+RUN mkdir -p /app/models /app/data
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV MODEL_DOWNLOAD_ROOT=/app/models
+ENV KEYS_FILE=/app/data/keys.txt
+ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
+ENV ROCM_PATH=/opt/rocm
+
+# Expose port
 EXPOSE 9854
 
-# РЈСЃС‚Р°РЅР°РІР»РёРІР°РµРј РїРµСЂРµРјРµРЅРЅС‹Рµ РѕРєСЂСѓР¶РµРЅРёСЏ РґР»СЏ ROCm
-ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
-ENV PYTORCH_ROCM_ARCH=gfx1030
+# Health check (shell form, so ${PORT} is expanded at runtime; app.py binds to PORT, default 9854)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:${PORT:-9854}/health || exit 1
+
+# Run the application
+CMD ["python", "app.py"]
 
-# РљРѕРјР°РЅРґР° РґР»СЏ Р·Р°РїСѓСЃРєР° РїСЂРёР»РѕР¶РµРЅРёСЏ
-CMD ["python3", "app.py"]
diff --git a/README.md b/README.md
index 72464ed..eaec10b 100644
--- a/README.md
+++ b/README.md
@@ -1,104 +1,86 @@
-BASED ON https://github.com/salute-developers/GigaAM
-
 # Simple ASR Server
 
-This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container.
+Простой сервер для автоматического распознавания речи (ASR) на базе OpenAI Whisper.
 
-## Prerequisites
+## Особенности
 
-Before you begin, ensure you have the following installed:
+- Поддержка различных моделей Whisper (tiny, base, small, medium, large, turbo)
+- Три формата вывода: plaintext, simple JSON, полный JSON
+- Параметр speedup для ускорения аудио перед распознаванием
+- Автоматическая конвертация аудио в поддерживаемый формат
+- Аутентификация запросов по API-ключам
+- Поддержка Docker
 
-*   [Docker](https://docs.docker.com/get-docker/)
-*   [Docker Compose](https://docs.docker.com/compose/install/)
+## Быстрый старт
 
-## Project Structure
+### Локальная установка
 
-```
-.
-в”њв”Ђв”Ђ app.py              # Main application file with FastAPI endpoint
-в”њв”Ђв”Ђ docker-compose.yml  # Docker Compose configuration
-в”њв”Ђв”Ђ Dockerfile          # Dockerfile for building the application image
-в”њв”Ђв”Ђ model/              # Directory for Whisper model files
-в””в”Ђв”Ђ requirements.txt    # Python dependencies
+1. Установите зависимости:
+```bash
+pip install -r requirements.txt
 ```
 
-## Setup
+2. Скопируйте и настройте переменные окружения:
+```bash
+cp .env.example .env
+```
 
-1.  **Clone the repository:**
+3. Запустите сервер:
+```bash
+python app.py
+```
 
-    ```bash
-    git clone https://github.com/SlavaVlad/simple-asr-server
-    cd simple-asr-server
-    ```
-3.  **Add API keys:**
+### Docker
 
-    Create a `keys.txt` file in the root of the project and add your API keys, one per line.
+1. Соберите и запустите контейнер:
+```bash
+docker-compose up --build
+```
 
-## Building and Running the Project
-
-You can build and run the project using Docker Compose.
-
-1.  **Build the Docker image:**
-
-    ```bash
-    docker-compose build
-    ```
-
-2.  **Run the container:**
-
-    ```bash
-    docker-compose up
-    ```
-
-    The application will be available at `http://0.0.0.0:9854`.
-
-## API Endpoint
+## API
 
 ### POST /transcribe
 
-This endpoint accepts an audio file and returns the transcription.
+Распознавание речи из аудиофайла.
 
-*   **URL:** `/transcribe`
-*   **Method:** `POST`
-*   **Headers:**
-    *   `X-API-Key`: Your API key.
-*   **Form Data:**
-    *   `file`: The audio file to be transcribed.
+**Параметры:**
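+- `file` (файл) - Аудиофайл для распознавания (поле multipart-формы)
+- `model_name` (опционально) - Модель Whisper; если не указана, используется модель по умолчанию
+- `output_format` (опционально) - Формат вывода: `plaintext`, `simple` или `json` (по умолчанию: `json`)
+- `speedup` (опционально) - Коэффициент ускорения аудио, 0.25-4.0 (по умолчанию: 1.0)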
 
-**Example using `curl`:**
+**Заголовки:**
+- `x-api-key` - API ключ
+
+**Примеры:**
 
 ```bash
-curl -X POST "http://localhost:9854/transcribe" \
-     -H "X-API-Key: YOUR_API_KEY" \
-     -F "file=@/path/to/your/audio.wav"
+# Простой текстовый вывод
+curl -X POST "http://localhost:9854/transcribe?output_format=plaintext&speedup=1.5" \
+  -H "x-api-key: YOUR_API_KEY" \
+  -F "file=@audio.wav"
+
+# JSON только с текстом
+curl -X POST "http://localhost:9854/transcribe?output_format=simple" \
+  -H "x-api-key: YOUR_API_KEY" \
+  -F "file=@audio.wav"
+
+# Полный JSON ответ с использованием другой модели
+curl -X POST "http://localhost:9854/transcribe?output_format=json&model_name=base" \
+  -H "x-api-key: YOUR_API_KEY" \
+  -F "file=@audio.wav"
 ```
 
-**Successful Response (200 OK):**
+### GET /health
 
-```json
-{
-  "transcription": [
-    {
-      "start_time": 0.0,
-      "end_time": 2.5,
-      "transcription": "Hello world."
-    }
-  ],
-  "text": "Hello world. ",
-  "metrics": {
-    "processing_time": 5.2,
-    "rtf": 0.5,
-    "word_rate": 2.0
-  }
-}
-```
+Проверка состояния сервера.
 
-**Error Response (401 Unauthorized):**
+## Переменные окружения
 
-If the API key is missing or invalid.
+См. `.env.example` для полного списка доступных переменных:
 
-```json
-{
-  "detail": "Invalid API Key"
-}
-```
+- `HOST` - Хост сервера (по умолчанию: 0.0.0.0)
+- `PORT` - Порт сервера (по умолчанию: 9854)
+- `DEFAULT_MODEL` - Модель по умолчанию (по умолчанию: turbo)
+- `MODEL_DOWNLOAD_ROOT` - Папка для загрузки моделей
+- `KEYS_FILE` - Файл с API-ключами (по умолчанию: `keys.txt`; в Docker-образе - `/app/data/keys.txt`)
diff --git a/app.py b/app.py
index cca21a8..4893492 100644
--- a/app.py
+++ b/app.py
@@ -1,13 +1,14 @@
 import logging
 import os
 import subprocess
-import time
-from os import getenv
-from typing import Dict
+import tempfile
+from typing import Optional
+from enum import Enum
 
-import gigaam
-from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
+import whisper
+from fastapi import FastAPI, Depends, HTTPException, UploadFile, File, Query
 from fastapi.security import APIKeyHeader
+from fastapi.responses import PlainTextResponse
 
 # Configure logging
 logging.basicConfig(
@@ -16,14 +17,21 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-app = FastAPI()
+app = FastAPI(title="Simple ASR Server", description="Audio transcription API using Whisper")
 
 # API key header
 api_key_header = APIKeyHeader(name="x-api-key")
 
+# Global model variable
+default_model = None
 
-def get_keys():  # РЅРµ Р±РµР№С‚Рµ РјРµРЅСЏ Р·Р° СЌС‚Рѕ
-    keys_file = "keys.txt"
+class OutputFormat(str, Enum):
+    plaintext = "plaintext"
+    simple = "simple"
+    json = "json"
+
+def get_keys():
+    keys_file = os.getenv("KEYS_FILE", "keys.txt")
     if not os.path.exists(keys_file):
         # Create a new keys file with a default key
         default_key = os.urandom(32).hex()
@@ -36,16 +44,41 @@ def get_keys():  # РЅРµ Р±РµР№С‚Рµ РјРµРЅСЏ Р·Р° СЌС‚Рѕ
         with open(keys_file, "r") as f:
             keys = [line.strip() for line in f if line.strip()]
         logger.info(f"Loaded {len(keys)} keys from file")
-        logger.debug(f"Keys: {keys}")
         if not keys:
             raise ValueError("No keys found in keys.txt")
         return keys
 
+def load_default_model():
+    """Load the default model on startup"""
+    global default_model
+    model_name = os.getenv("DEFAULT_MODEL", "turbo")
+    model_download_root = os.getenv("MODEL_DOWNLOAD_ROOT", None)
+
+    logger.info(f"Loading default model: {model_name}")
+    try:
+        default_model = whisper.load_model(model_name, download_root=model_download_root, in_memory=True)
+        logger.info(f"Successfully loaded model: {model_name}")
+    except Exception as e:
+        logger.error(f"Failed to load default model {model_name}: {e}")
+        raise
+
+def get_model(model_name: Optional[str] = None):
+    """Get model - either default or load new one if specified"""
+    global default_model
+
+    if model_name is None:
+        return default_model
+
+    # If different model requested, load it
+    if model_name != os.getenv("DEFAULT_MODEL", "turbo"):
+        model_download_root = os.getenv("MODEL_DOWNLOAD_ROOT", None)
+        logger.info(f"Loading requested model: {model_name}")
+        return whisper.load_model(model_name, download_root=model_download_root)
+
+    return default_model
 
 def convert_audio(input_path: str, output_path: str, speed: float = 1.0):
-    """
-    Convert audio to compatible format and speed up if needed.
-    """
+    """Convert audio to compatible format and speed up if needed."""
     try:
         command = [
             'ffmpeg', '-i', input_path,
@@ -57,97 +90,69 @@ def convert_audio(input_path: str, output_path: str, speed: float = 1.0):
             '-y'
         ]
         logger.debug(f"Running FFmpeg command: {' '.join(command)}")
-        subprocess.run(command, check=True, capture_output=True)
+        result = subprocess.run(command, check=True, capture_output=True, text=True)
         return True
     except subprocess.CalledProcessError as e:
-        logger.error(f"FFmpeg conversion failed: {e.stderr.decode()}")
+        logger.error(f"FFmpeg conversion failed: {e.stderr}")
         return False
 
-
-def get_audio_duration(file_path: str) -> float:
-    """Get audio duration using ffprobe"""
-    cmd = [
-        'ffprobe',
-        '-v', 'quiet',
-        '-show_entries', 'format=duration',
-        '-of', 'default=noprint_wrappers=1:nokey=1',
-        file_path
-    ]
-    try:
-        output = subprocess.check_output(cmd).decode().strip()
-        return float(output)
-    except:
-        return 0.0
-
-
 @app.post("/transcribe")
 async def transcribe_audio(
-        file: UploadFile = File(...),
-        token: str = Depends(api_key_header),
-        model: str = "turbo",
-        verbose: Optional[bool] = None,
-        temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
-        compression_ratio_threshold: Optional[float] = 2.4,
-        speed_up: Optional[float] = 1.25,
-        logprob_threshold: Optional[float] = -1.0,
-        no_speech_threshold: Optional[float] = 0.6,
-        condition_on_previous_text: bool = True,
-        initial_prompt: Optional[str] = None,
-        word_timestamps: bool = False,
-        prepend_punctuations: str = "\"'\"Вї([{-",
-        append_punctuations: str = "\"\'.гЂ‚,пјЊ!пјЃ?пјџ:пјљ\")]}гЂЃ",
-        clip_timestamps: Union[str, List[float]] = "0",
-        hallucination_silence_threshold: Optional[float] = None
+    file: UploadFile = File(...),
+    token: str = Depends(api_key_header),
+    model_name: Optional[str] = Query(None, description="Model name to use for transcription"),
+    output_format: OutputFormat = Query(OutputFormat.json, description="Output format: plaintext, simple, or json"),
+    speedup: float = Query(1.0, ge=0.25, le=4.0, description="Speed up factor for audio (0.25-4.0)")
 ):
+    """Transcribe audio file with configurable output format"""
+
     # Token validation
     if token not in get_keys():
         logger.warning(f"Invalid token attempt: {token}")
         raise HTTPException(status_code=403, detail="Forbidden")
 
-    model = whisper.load_model(model)  # Load the Whisper model
+    logger.info(f"Processing file: {file.filename}, model: {model_name or 'default'}, format: {output_format}, speedup: {speedup}")
 
-    logger.info(f"Processing file: {file.filename} with model: {model}")
+    # Get model
+    try:
+        model = get_model(model_name)
+    except Exception as e:
+        logger.error(f"Failed to load model: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to load model: {str(e)}")
 
-    # Save uploaded file
-    temp_input_path = f"/tmp/input_{file.filename}"
-    temp_output_path = f"/tmp/converted_{file.filename}.wav"
+    # Create temporary files
+    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as temp_input:
+        temp_input_path = temp_input.name
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_output:
+        temp_output_path = temp_output.name
 
     try:
+        # Save uploaded file
         with open(temp_input_path, "wb") as f:
-            f.write(await file.read())
+            content = await file.read()
+            f.write(content)
 
-        # Convert audio if needed
-        logger.debug("Converting audio file")
-        if not convert_audio(temp_input_path, temp_output_path, speed_up):
-            raise HTTPException(status_code=400, detail="Audio conversion failed")
-
-        # Get audio duration before speed up
-        original_duration = get_audio_duration(temp_input_path)
+        # Convert audio if speedup is not 1.0 or format needs conversion
+        if speedup != 1.0 or not file.filename.lower().endswith('.wav'):
+            logger.debug(f"Converting audio file with speedup: {speedup}")
+            if not convert_audio(temp_input_path, temp_output_path, speedup):
+                raise HTTPException(status_code=400, detail="Audio conversion failed")
+            audio_file_path = temp_output_path
+        else:
+            audio_file_path = temp_input_path
 
         # Transcribe
         logger.info("Starting transcription")
-        if original_duration > 30:
-            logger.info("Audio duration > 30 seconds, using transcribe_longform")
-            transcription_result = model.transcribe_longform(
-                temp_output_path
-            )
-        else:
-            logger.info("Audio duration <= 30 seconds, using transcribe")
-            transcription_result = model.transcribe(
-                temp_output_path
-            )
+        result = model.transcribe(audio_file_path)
 
-        full_text = ""
-        for part in transcription_result:
-            if part["transcription"].strip() != "":
-                full_text += part["transcription"].strip() + " "
-
-        result = {
-            "transcription": transcription_result,
-            "text": full_text
-        }
-
-        return result
+        # Format output based on requested format
+        if output_format == OutputFormat.plaintext:
+            return PlainTextResponse(content=result["text"], media_type="text/plain")
+        elif output_format == OutputFormat.simple:
+            return {"text": result["text"]}
+        else:  # json format
+            return result
 
     except Exception as e:
         logger.error(f"Transcription failed: {str(e)}")
@@ -155,16 +160,29 @@ async def transcribe_audio(
 
     finally:
         # Cleanup temporary files
-        if os.path.exists(temp_input_path):
-            os.remove(temp_input_path)
-        if os.path.exists(temp_output_path):
-            os.remove(temp_output_path)
+        for path in [temp_input_path, temp_output_path]:
+            if os.path.exists(path):
+                try:
+                    os.remove(path)
+                except Exception as e:
+                    logger.warning(f"Failed to remove temp file {path}: {e}")
 
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy", "model_loaded": default_model is not None}
 
 def main():
     import uvicorn
+
+    # Load default model and keys
+    load_default_model()
     get_keys()
-    uvicorn.run(app, host="0.0.0.0", port=9854, log_level="debug")
+
+    port = int(os.getenv("PORT", 9854))
+    host = os.getenv("HOST", "0.0.0.0")
+
+    uvicorn.run(app, host=host, port=port, log_level="info")
 
 if __name__ == "__main__":
     main()
diff --git a/docker-compose.yml b/docker-compose.yml
index cbc6f67..c6d6ae7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,6 +1,31 @@
 services:
-  whisper-app:
+  simple-asr-server:
     build: .
     ports:
-      - "9854:9854"
-    command: ["python", "app.py"]
+      - "${PORT:-9854}:${PORT:-9854}"  # app.py binds to PORT inside the container, so map both sides
+    environment:
+      - HOST=${HOST:-0.0.0.0}
+      - PORT=${PORT:-9854}
+      - DEFAULT_MODEL=${DEFAULT_MODEL:-turbo}
+      - MODEL_DOWNLOAD_ROOT=${MODEL_DOWNLOAD_ROOT:-/app/models}
+      - KEYS_FILE=${KEYS_FILE:-/app/data/keys.txt}
+      - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-10.3.0}
+    volumes:
+      - ./models:/app/models
+      - ./data:/app/data
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    group_add:
+      - video
+      - render
+    security_opt:
+      - seccomp:unconfined
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${PORT:-9854}/health"]  # probe the port the app actually listens on
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+
diff --git a/requirements.txt b/requirements.txt
index 6a0fedf..217c4d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,5 @@
 fastapi
 uvicorn[standard]
 python-multipart
-gigaam
-gigaam[longform]
-ffmpeg-python
-PyYAML
-numpy<2.0.0
\ No newline at end of file
+openai-whisper
+python-dotenv
diff --git a/simple-asr-server.service b/simple-asr-server.service
new file mode 100644
index 0000000..9fa30a4
--- /dev/null
+++ b/simple-asr-server.service
@@ -0,0 +1,20 @@
+[Unit]
+Description=Whisper ASR Server (ROCM)
+After=network.target
+Wants=network.target
+
+[Service]
+Type=exec
+User=asr
+Group=asr
+WorkingDirectory=/opt/asr
+ExecStart=/opt/asr/start_server.sh
+ExecReload=/bin/kill -HUP $MAINPID
+Restart=always
+RestartSec=10
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=asr
+
+[Install]
+WantedBy=multi-user.target
diff --git a/start_server.sh b/start_server.sh
new file mode 100644
index 0000000..f354e2a
--- /dev/null
+++ b/start_server.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Simple ASR Server startup script for systemd
+# This script loads environment variables from .env file and starts the server
+
+set -e
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+APP_DIR="${SCRIPT_DIR}"
+
+# Load environment variables from .env file if it exists
+if [ -f "${APP_DIR}/.env" ]; then
+    echo "Loading environment variables from ${APP_DIR}/.env"
+    set -a  # automatically export all variables
+    source "${APP_DIR}/.env"
+    set +a
+else
+    echo "Warning: .env file not found at ${APP_DIR}/.env"
+    echo "Using default environment variables"
+fi
+
+# Set default values if not provided in .env
+export HOST=${HOST:-"0.0.0.0"}
+export PORT=${PORT:-9854}
+export DEFAULT_MODEL=${DEFAULT_MODEL:-"turbo"}
+export MODEL_DOWNLOAD_ROOT=${MODEL_DOWNLOAD_ROOT:-"${APP_DIR}/models"}
+export KEYS_FILE=${KEYS_FILE:-"${APP_DIR}/keys.txt"}
+export LOG_LEVEL=${LOG_LEVEL:-"INFO"}
+
+# Create necessary directories
+mkdir -p "${MODEL_DOWNLOAD_ROOT}"
+mkdir -p "$(dirname "${KEYS_FILE}")"
+
+# Change to app directory
+cd "${APP_DIR}"
+
+echo "Starting Simple ASR Server..."
+echo "Host: ${HOST}"
+echo "Port: ${PORT}"
+echo "Default Model: ${DEFAULT_MODEL}"
+echo "Model Download Root: ${MODEL_DOWNLOAD_ROOT}"
+echo "Keys File: ${KEYS_FILE}"
+echo "Log Level: ${LOG_LEVEL}"
+
+# Start the application
+exec python3 app.py

From ce41cf4a09f976701096e47911885d0355981bb2 Mon Sep 17 00:00:00 2001
From: red <red@itmo.ru>
Date: Wed, 20 Aug 2025 23:25:05 +0900
Subject: [PATCH 4/5] =?UTF-8?q?-=20=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=B5=D0=BD=D1=8B=20=D0=BF=D0=B0=D1=80=D0=B0=D0=BC=D0=B5=D1=82?=
 =?UTF-8?q?=D1=80=D1=8B=20=D0=BC=D0=BE=D0=B4=D0=B5=D0=BB=D0=B8=20=D0=B2=20?=
 =?UTF-8?q?=D0=B3=D0=B5=D1=82=20=D1=8D=D0=BD=D0=B4=D0=BF=D0=BE=D0=B8=D0=BD?=
 =?UTF-8?q?=D1=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/app.py b/app.py
index 4893492..54a955c 100644
--- a/app.py
+++ b/app.py
@@ -2,7 +2,7 @@ import logging
 import os
 import subprocess
 import tempfile
-from typing import Optional
+from typing import Optional, Union, List, Tuple
 from enum import Enum
 
 import whisper
@@ -102,9 +102,23 @@ async def transcribe_audio(
     token: str = Depends(api_key_header),
     model_name: Optional[str] = Query(None, description="Model name to use for transcription"),
     output_format: OutputFormat = Query(OutputFormat.json, description="Output format: plaintext, simple, or json"),
-    speedup: float = Query(1.0, ge=0.25, le=4.0, description="Speed up factor for audio (0.25-4.0)")
+    speedup: float = Query(1.0, ge=0.25, le=4.0, description="Speed up factor for audio (0.25-4.0)"),
+    # Whisper model parameters
+    verbose: Optional[bool] = Query(None, description="Whether to print out the progress and debug messages"),
+    temperature: Union[float, str] = Query("0.0,0.2,0.4,0.6,0.8,1.0", description="Temperature for sampling (single float or comma-separated values)"),
+    compression_ratio_threshold: Optional[float] = Query(2.4, description="If the gzip compression ratio is above this value, treat as failed"),
+    logprob_threshold: Optional[float] = Query(-1.0, description="If the average log probability over sampled tokens is below this value, treat as failed"),
+    no_speech_threshold: Optional[float] = Query(0.6, description="If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below logprob_threshold, consider the segment as silent"),
+    condition_on_previous_text: bool = Query(True, description="If True, the previous output of the model is provided as a prompt for the next window"),
+    initial_prompt: Optional[str] = Query(None, description="Optional text to provide as a prompt for the first window"),
+    carry_initial_prompt: bool = Query(False, description="If True, the initial prompt is carried over to the next window"),
+    word_timestamps: bool = Query(False, description="Extract word-level timestamps using the cross-attention pattern and dynamic time warping"),
+    prepend_punctuations: str = Query("\"'([{-", description="If word_timestamps is True, merge these punctuation marks with the next word"),
+    append_punctuations: str = Query("\"'.,:;!?)]}", description="If word_timestamps is True, merge these punctuation marks with the previous word"),
+    clip_timestamps: Union[str, List[float]] = Query("0", description="Comma-separated list of clip timestamps to use for transcription"),
+    hallucination_silence_threshold: Optional[float] = Query(None, description="When word_timestamps is True, skip silent periods longer than this threshold (in seconds)"),
 ):
-    """Transcribe audio file with configurable output format"""
+    """Transcribe audio file with configurable output format and comprehensive Whisper parameters"""
 
     # Token validation
     if token not in get_keys():
@@ -142,9 +156,59 @@ async def transcribe_audio(
         else:
             audio_file_path = temp_input_path
 
+        # Prepare transcription parameters
+        transcribe_params = {}
+
+        # Handle temperature parameter (a single float, or a comma-separated string of floats that is converted to a tuple)
+        if isinstance(temperature, str) and "," in temperature:
+            try:
+                temp_values = [float(x.strip()) for x in temperature.split(",")]
+                transcribe_params["temperature"] = tuple(temp_values)
+            except ValueError:
+                transcribe_params["temperature"] = 0.0
+        else:
+            try:
+                transcribe_params["temperature"] = float(temperature)
+            except (ValueError, TypeError):
+                transcribe_params["temperature"] = 0.0
+
+        # Handle clip_timestamps parameter
+        if isinstance(clip_timestamps, str) and clip_timestamps != "0":
+            try:
+                if "," in clip_timestamps:
+                    transcribe_params["clip_timestamps"] = [float(x.strip()) for x in clip_timestamps.split(",")]
+                else:
+                    transcribe_params["clip_timestamps"] = clip_timestamps
+            except ValueError:
+                transcribe_params["clip_timestamps"] = "0"
+        else:
+            transcribe_params["clip_timestamps"] = clip_timestamps
+
+        # Add other parameters if they are not None
+        if verbose is not None:
+            transcribe_params["verbose"] = verbose
+        if compression_ratio_threshold is not None:
+            transcribe_params["compression_ratio_threshold"] = compression_ratio_threshold
+        if logprob_threshold is not None:
+            transcribe_params["logprob_threshold"] = logprob_threshold
+        if no_speech_threshold is not None:
+            transcribe_params["no_speech_threshold"] = no_speech_threshold
+
+        transcribe_params["condition_on_previous_text"] = condition_on_previous_text
+        transcribe_params["carry_initial_prompt"] = carry_initial_prompt
+        transcribe_params["word_timestamps"] = word_timestamps
+        transcribe_params["prepend_punctuations"] = prepend_punctuations
+        transcribe_params["append_punctuations"] = append_punctuations
+
+        if initial_prompt is not None:
+            transcribe_params["initial_prompt"] = initial_prompt
+        if hallucination_silence_threshold is not None:
+            transcribe_params["hallucination_silence_threshold"] = hallucination_silence_threshold
+
         # Transcribe
         logger.info("Starting transcription")
-        result = model.transcribe(audio_file_path)
+        logger.debug(f"Transcription parameters: {transcribe_params}")
+        result = model.transcribe(audio_file_path, **transcribe_params)
 
         # Format output based on requested format
         if output_format == OutputFormat.plaintext:
@@ -170,7 +234,7 @@ async def transcribe_audio(
 @app.get("/health")
 async def health_check():
-    """Health check endpoint"""
-    return {"status": "healthy", "model_loaded": default_model is not None}
+    """Health check endpoint: returns status, whether default_model is set, and str(default_model) (None when it is not set)."""
+    return {"status": "healthy", "model_loaded": default_model is not None, "model_name": str(default_model) if default_model is not None else None}
 
 def main():
     import uvicorn

From d70f2e7089e29d6b3c4888e78adca54927241db0 Mon Sep 17 00:00:00 2001
From: red <red@itmo.ru>
Date: Wed, 3 Sep 2025 10:50:44 +0300
Subject: [PATCH 5/5] =?UTF-8?q?=D0=9D=D0=B0=D0=B1=D0=BE=D1=80=20=D0=B2?=
 =?UTF-8?q?=D1=81=D1=8F=D0=BA=D0=B8=D1=85=20=D1=88=D1=82=D1=83=D0=BA=20?=
 =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D0=B4=D0=B5=D0=BF=D0=BB=D0=BE=D1=8F.=20?=
 =?UTF-8?q?=D0=9F=D1=80=D0=B5=D0=B4=D0=BF=D0=BE=D0=BB=D0=B0=D0=B3=D0=B0?=
 =?UTF-8?q?=D0=B5=D1=82=D1=81=D1=8F=20=D1=87=D1=82=D0=BE=20ROCM=20=D1=81?=
 =?UTF-8?q?=D1=82=D0=BE=D0=B8=D1=82=20=D0=BD=D0=B0=20=D1=85=D0=BE=D1=81?=
 =?UTF-8?q?=D1=82=D0=B5!?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .env.example                                  | 19 ++++++++++++++-----
 .../inspectionProfiles/profiles_settings.xml  |  6 ++++++
 .idea/material_theme_project_new.xml          | 12 ++++++++++++
 .idea/misc.xml                                |  7 +++++++
 .idea/vcs.xml                                 |  6 ++++++
 Dockerfile                                    |  7 ++-----
 requirements.txt                              |  2 ++
 start_server.sh                               | 17 +++++++++++++++++
 8 files changed, 66 insertions(+), 10 deletions(-)
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/material_theme_project_new.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/vcs.xml
 mode change 100644 => 100755 start_server.sh

diff --git a/.env.example b/.env.example
index 73fcce6..779f616 100644
--- a/.env.example
+++ b/.env.example
@@ -1,13 +1,22 @@
+
 # Server configuration
 HOST=0.0.0.0
 PORT=9854
 
 # Model configuration
-DEFAULT_MODEL=turbo
-MODEL_DOWNLOAD_ROOT=/app/models
+DEFAULT_MODEL=tiny
+MODEL_DOWNLOAD_ROOT=./models
 
-# API Keys
-KEYS_FILE=/app/keys.txt
+# Security configuration
+KEYS_FILE=keys.txt
 
-# Logging
+# Logging configuration (optional)
 LOG_LEVEL=INFO
+
+# ROCm GPU configuration
+HSA_OVERRIDE_GFX_VERSION=10.3.0
+ROCM_PATH=/opt/rocm
+
+# Example of available Whisper models:
+# tiny, base, small, medium, large, turbo
+# turbo is recommended for best speed/quality balance
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/material_theme_project_new.xml b/.idea/material_theme_project_new.xml
new file mode 100644
index 0000000..a8dd540
--- /dev/null
+++ b/.idea/material_theme_project_new.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="MaterialThemeProjectNewConfig">
+    <option name="metadata">
+      <MTProjectMetadataState>
+        <option name="migrated" value="true" />
+        <option name="pristineConfig" value="false" />
+        <option name="userId" value="-6077f146:198b84bb7ea:-7ffe" />
+      </MTProjectMetadataState>
+    </option>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..b79d30e
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.13 (simple-asr-server)" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (simple-asr-server)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 57398d0..9b71450 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
-# Use ROCm compatible Python image as base
-FROM rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2
+# Use official Python image as base
+FROM python:3.10-slim
 
 # Set working directory
 WORKDIR /app
@@ -31,8 +31,6 @@ RUN mkdir -p /app/models /app/data
 ENV PYTHONUNBUFFERED=1
 ENV MODEL_DOWNLOAD_ROOT=/app/models
 ENV KEYS_FILE=/app/data/keys.txt
-ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
-ENV ROCM_PATH=/opt/rocm
 
 # Expose port
 EXPOSE 9854
@@ -43,4 +41,3 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
 
 # Run the application
 CMD ["python", "app.py"]
-
diff --git a/requirements.txt b/requirements.txt
index 217c4d8..c3f060d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,5 @@ uvicorn[standard]
 python-multipart
 openai-whisper
 python-dotenv
+
+
diff --git a/start_server.sh b/start_server.sh
old mode 100644
new mode 100755
index f354e2a..2ac7bf9
--- a/start_server.sh
+++ b/start_server.sh
@@ -32,6 +32,22 @@ export LOG_LEVEL=${LOG_LEVEL:-"INFO"}
 mkdir -p "${MODEL_DOWNLOAD_ROOT}"
 mkdir -p "$(dirname "${KEYS_FILE}")"
 
+# Check if virtual environment exists, create if not
+VENV_DIR="${APP_DIR}/venv"
+if [ ! -d "${VENV_DIR}" ]; then
+    echo "Creating virtual environment..."
+    python3 -m venv "${VENV_DIR}"
+fi
+
+# Activate virtual environment
+echo "Activating virtual environment..."
+source "${VENV_DIR}/bin/activate"
+
+# Install/upgrade dependencies
+echo "Installing/upgrading dependencies..."
+pip install --upgrade pip
+pip install -r "${APP_DIR}/requirements.txt"
+
 # Change to app directory
 cd "${APP_DIR}"
 
@@ -45,3 +61,4 @@ echo "Log Level: ${LOG_LEVEL}"
 
 # Start the application
 exec python3 app.py
+