From 5b0d04a2405d58d484d19ad2856613c60d124dfb Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 17:30:37 +0300 Subject: [PATCH 1/5] fix: model load on request and keep in mem rename: model_name to model --- app.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index cb47a23..b902c31 100644 --- a/app.py +++ b/app.py @@ -18,7 +18,6 @@ logger = logging.getLogger(__name__) app = FastAPI() - # API key header api_key_header = APIKeyHeader(name="x-api-key") @@ -108,7 +107,7 @@ def get_audio_duration(file_path: str) -> float: async def transcribe_audio( file: UploadFile = File(...), token: str = Depends(api_key_header), - model_name: str = "turbo", + model: str = "turbo", verbose: Optional[bool] = None, temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), compression_ratio_threshold: Optional[float] = 2.4, @@ -127,7 +126,9 @@ async def transcribe_audio( logger.warning(f"Invalid token attempt: {token}") raise HTTPException(status_code=403, detail="Forbidden") - logger.info(f"Processing file: {file.filename} with model: {model_name}") + model = whisper.load_model(model) # Load the Whisper model + + logger.info(f"Processing file: {file.filename} with model: {model}") metrics = TranscriptionMetrics() # Save uploaded file From 9eb026b220595aa7423fff2c97d44b580b218329 Mon Sep 17 00:00:00 2001 From: red Date: Sun, 17 Aug 2025 23:24:24 +0900 Subject: [PATCH 2/5] - deleted metrics --- app.py | 38 ++++---------------------------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/app.py b/app.py index b902c31..f5cf989 100644 --- a/app.py +++ b/app.py @@ -42,9 +42,9 @@ def get_keys(): # не бейте меня за это return keys -def convert_audio(input_path: str, output_path: str, speed: float = 1.25): +def convert_audio(input_path: str, output_path: str, speed: float = 1.0): """ - Convert audio to compatible format and speed up + Convert audio to compatible format and speed up if needed. """ try: command = [ @@ -64,29 +64,6 @@ def convert_audio(input_path: str, output_path: str, speed: float = 1.25): return False -class TranscriptionMetrics: - def __init__(self): - self.start_time = time.time() - self.end_time = None - self.text_length = 0 - self.audio_duration = 0 - - def stop(self, text: str, audio_duration: float): - self.end_time = time.time() - self.text_length = len(text) - self.audio_duration = audio_duration - - def get_metrics(self) -> Dict[str, float]: - processing_time = self.end_time - self.start_time - return { - "processing_time_seconds": round(processing_time, 2), - "characters_per_second": round(self.text_length / processing_time, 2), - "audio_realtime_ratio": round(self.audio_duration / processing_time, 2), - "audio_duration": round(self.audio_duration, 2), - "text_length": self.text_length - } - - def get_audio_duration(file_path: str) -> float: """Get audio duration using ffprobe""" cmd = [ @@ -111,6 +88,7 @@ async def transcribe_audio( verbose: Optional[bool] = None, temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), compression_ratio_threshold: Optional[float] = 2.4, + speed_up: Optional[float] = 1.25, logprob_threshold: Optional[float] = -1.0, no_speech_threshold: Optional[float] = 0.6, condition_on_previous_text: bool = True, @@ -129,7 +107,6 @@ async def transcribe_audio( model = whisper.load_model(model) # Load the Whisper model logger.info(f"Processing file: {file.filename} with model: {model}") - metrics = TranscriptionMetrics() # Save uploaded file temp_input_path = f"/tmp/input_{file.filename}" @@ -141,7 +118,7 @@ async def transcribe_audio( # Convert audio if needed logger.debug("Converting audio file") - if not convert_audio(temp_input_path, temp_output_path): + if not convert_audio(temp_input_path, temp_output_path, speed_up): raise HTTPException(status_code=400, detail="Audio conversion failed") # Get audio duration before speed up @@ -165,13 +142,6 @@ async def transcribe_audio( hallucination_silence_threshold=hallucination_silence_threshold ) - # Calculate metrics - metrics.stop(result["text"], original_duration) - logger.info(f"Transcription metrics: {metrics.get_metrics()}") - - # Add metrics to result - result["metrics"] = metrics.get_metrics() - return result except Exception as e: From 228f67d07ffccd589da1ec8f933e7fc106fc2494 Mon Sep 17 00:00:00 2001 From: red Date: Wed, 20 Aug 2025 23:18:02 +0900 Subject: [PATCH 3/5] =?UTF-8?q?-=20=D0=9F=D0=BE=D0=BC=D0=B5=D0=BD=D1=8F?= =?UTF-8?q?=D0=BB=20=D0=B2=D1=81=D1=91=20=D1=81=D0=BD=D0=BE=D0=B2=D0=B0=20?= =?UTF-8?q?=D0=BD=D0=B0=20Whisper=20-=20=D0=94=D0=BE=D0=B1=D0=B0=D0=B2?= =?UTF-8?q?=D0=B8=D0=BB=20=D0=BF=D1=80=D0=B5=D0=B4=D0=B7=D0=B0=D0=B3=D1=80?= =?UTF-8?q?=D1=83=D0=B7=D0=BA=D1=83=20=D0=BC=D0=BE=D0=B4=D0=B5=D0=BB=D0=B8?= =?UTF-8?q?=20=D0=BF=D0=BE-=D1=83=D0=BC=D0=BE=D0=BB=D1=87=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D1=8E=20-=20=D0=A3=D0=B1=D1=80=D0=B0=D0=BB=20=D0=BC?= =?UTF-8?q?=D0=B5=D1=82=D1=80=D0=B8=D0=BA=D0=B8=20-=20=D0=94=D0=BE=D0=B1?= =?UTF-8?q?=D0=B0=D0=B2=D0=B8=D0=BB=20=D1=81=D0=BA=D1=80=D0=B8=D0=BF=D1=82?= =?UTF-8?q?=D1=8B=20=D0=B4=D0=BB=D1=8F=20=D1=81=D1=82=D0=B0=D1=80=D1=82?= =?UTF-8?q?=D0=B0=20-=20=D0=94=D0=BB=D1=8F=20=D0=BE=D1=82=D1=87=D0=B0?= =?UTF-8?q?=D1=8F=D0=BD=D0=BD=D1=8B=D1=85=20Dockerfile=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D1=81=D0=B1=D0=BE=D1=80=D0=BA=D0=B8=20=D0=BA=D0=BE?= =?UTF-8?q?=D0=BD=D1=82=D0=B5=D0=B9=D0=BD=D0=B5=D1=80=D0=B0=20=D0=BD=D0=B0?= =?UTF-8?q?=2070=D0=93=D0=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 13 +++ .idea/simple-asr-server.iml | 7 ++ Dockerfile | 42 ++++++-- README.md | 138 ++++++++++++-------------- app.py | 190 ++++++++++++++++++++---------------- docker-compose.yml | 31 +++++- requirements.txt | 7 +- simple-asr-server.service | 20 ++++ start_server.sh | 47 +++++++++ 9 files changed, 314 insertions(+), 181 deletions(-) create mode 100644 .env.example create mode 100644 .idea/simple-asr-server.iml create mode 100644 simple-asr-server.service create mode 100644 start_server.sh diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..73fcce6 --- /dev/null +++ b/.env.example @@ -0,0 +1,13 @@ +# Server configuration +HOST=0.0.0.0 +PORT=9854 + +# Model configuration +DEFAULT_MODEL=turbo +MODEL_DOWNLOAD_ROOT=/app/models + +# API Keys +KEYS_FILE=/app/keys.txt + +# Logging +LOG_LEVEL=INFO diff --git a/.idea/simple-asr-server.iml b/.idea/simple-asr-server.iml new file mode 100644 index 0000000..ec63674 --- /dev/null +++ b/.idea/simple-asr-server.iml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 2f7749a..57398d0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,46 @@ -FROM rocm/pytorch:rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0 +# Use ROCm compatible Python image as base +FROM rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2 +# Set working directory WORKDIR /app +# Install system dependencies RUN apt-get update && apt-get install -y \ ffmpeg \ + git \ + curl \ python3-pip \ - python3-venv \ + && rm -rf /var/lib/apt/lists/* +# Update pip +RUN pip install --upgrade pip + +# Copy requirements first for better caching COPY requirements.txt . -RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt -COPY . . +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt +# Copy application code +COPY app.py . + +# Create directory for models and keys +RUN mkdir -p /app/models /app/data + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV MODEL_DOWNLOAD_ROOT=/app/models +ENV KEYS_FILE=/app/data/keys.txt +ENV HSA_OVERRIDE_GFX_VERSION=10.3.0 +ENV ROCM_PATH=/opt/rocm + +# Expose port EXPOSE 9854 -# Устанавливаем переменные окружения для ROCm -ENV HSA_OVERRIDE_GFX_VERSION=10.3.0 -ENV PYTORCH_ROCM_ARCH=gfx1030 +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD curl -f http://localhost:9854/health || exit 1 + +# Run the application +CMD ["python", "app.py"] -# Команда для запуска приложения -CMD ["python3", "app.py"] diff --git a/README.md b/README.md index 72464ed..eaec10b 100644 --- a/README.md +++ b/README.md @@ -1,104 +1,86 @@ -BASED ON https://github.com/salute-developers/GigaAM - # Simple ASR Server -This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container. + (ASR) OpenAI Whisper. -## Prerequisites +## -Before you begin, ensure you have the following installed: +- Whisper (tiny, base, small, medium, large, turbo) +- : plaintext, simple JSON, JSON +- speedup +- +- API +- Docker -* [Docker](https://docs.docker.com/get-docker/) -* [Docker Compose](https://docs.docker.com/compose/install/) +## -## Project Structure +### -``` -. -├── app.py # Main application file with FastAPI endpoint -├── docker-compose.yml # Docker Compose configuration -├── Dockerfile # Dockerfile for building the application image -├── model/ # Directory for Whisper model files -└── requirements.txt # Python dependencies +1. : +```bash +pip install -r requirements.txt ``` -## Setup +2. : +```bash +cp .env.example .env +``` -1. **Clone the repository:** +3. : +```bash +python app.py +``` - ```bash - git clone https://github.com/SlavaVlad/simple-asr-server - cd simple-asr-server - ``` -3. **Add API keys:** +### Docker - Create a `keys.txt` file in the root of the project and add your API keys, one per line. +1. : +```bash +docker-compose up --build +``` -## Building and Running the Project - -You can build and run the project using Docker Compose. - -1. **Build the Docker image:** - - ```bash - docker-compose build - ``` - -2. **Run the container:** - - ```bash - docker-compose up - ``` - - The application will be available at `http://0.0.0.0:9854`. - -## API Endpoint +## API ### POST /transcribe -This endpoint accepts an audio file and returns the transcription. + . -* **URL:** `/transcribe` -* **Method:** `POST` -* **Headers:** - * `X-API-Key`: Your API key. -* **Form Data:** - * `file`: The audio file to be transcribed. +**:** +- `file` () - +- `model_name` () - Whisper +- `output_format` - : `plaintext`, `simple`, `json` +- `speedup` - (0.25-4.0) -**Example using `curl`:** +**:** +- `x-api-key` - API + +**:** ```bash -curl -X POST "http://localhost:9854/transcribe" \ - -H "X-API-Key: YOUR_API_KEY" \ - -F "file=@/path/to/your/audio.wav" +# +curl -X POST "http://localhost:9854/transcribe?output_format=plaintext&speedup=1.5" \ + -H "x-api-key: YOUR_API_KEY" \ + -F "file=@audio.wav" + +# JSON +curl -X POST "http://localhost:9854/transcribe?output_format=simple" \ + -H "x-api-key: YOUR_API_KEY" \ + -F "file=@audio.wav" + +# JSON +curl -X POST "http://localhost:9854/transcribe?output_format=json&model_name=base" \ + -H "x-api-key: YOUR_API_KEY" \ + -F "file=@audio.wav" ``` -**Successful Response (200 OK):** +### GET /health -```json -{ - "transcription": [ - { - "start_time": 0.0, - "end_time": 2.5, - "transcription": "Hello world." - } - ], - "text": "Hello world. ", - "metrics": { - "processing_time": 5.2, - "rtf": 0.5, - "word_rate": 2.0 - } -} -``` + . -**Error Response (401 Unauthorized):** +## -If the API key is missing or invalid. +. `.env.example` : -```json -{ - "detail": "Invalid API Key" -} -``` +- `HOST` - ( : 0.0.0.0) +- `PORT` - ( : 9854) +- `DEFAULT_MODEL` - ( : turbo) +- `MODEL_DOWNLOAD_ROOT` - +- `KEYS_FILE` - API diff --git a/app.py b/app.py index cca21a8..4893492 100644 --- a/app.py +++ b/app.py @@ -1,13 +1,14 @@ import logging import os import subprocess -import time -from os import getenv -from typing import Dict +import tempfile +from typing import Optional +from enum import Enum -import gigaam -from fastapi import FastAPI, Depends, HTTPException, UploadFile, File +import whisper +from fastapi import FastAPI, Depends, HTTPException, UploadFile, File, Query from fastapi.security import APIKeyHeader +from fastapi.responses import PlainTextResponse # Configure logging logging.basicConfig( @@ -16,14 +17,21 @@ logging.basicConfig( ) logger = logging.getLogger(__name__) -app = FastAPI() +app = FastAPI(title="Simple ASR Server", description="Audio transcription API using Whisper") # API key header api_key_header = APIKeyHeader(name="x-api-key") +# Global model variable +default_model = None -def get_keys(): # не бейте меня за это - keys_file = "keys.txt" +class OutputFormat(str, Enum): + plaintext = "plaintext" + simple = "simple" + json = "json" + +def get_keys(): + keys_file = os.getenv("KEYS_FILE", "keys.txt") if not os.path.exists(keys_file): # Create a new keys file with a default key default_key = os.urandom(32).hex() @@ -36,16 +44,41 @@ def get_keys(): # не бейте меня за это with open(keys_file, "r") as f: keys = [line.strip() for line in f if line.strip()] logger.info(f"Loaded {len(keys)} keys from file") - logger.debug(f"Keys: {keys}") if not keys: raise ValueError("No keys found in keys.txt") return keys +def load_default_model(): + """Load the default model on startup""" + global default_model + model_name = os.getenv("DEFAULT_MODEL", "turbo") + model_download_root = os.getenv("MODEL_DOWNLOAD_ROOT", None) + + logger.info(f"Loading default model: {model_name}") + try: + default_model = whisper.load_model(model_name, download_root=model_download_root, in_memory=True) + logger.info(f"Successfully loaded model: {model_name}") + except Exception as e: + logger.error(f"Failed to load default model {model_name}: {e}") + raise + +def get_model(model_name: Optional[str] = None): + """Get model - either default or load new one if specified""" + global default_model + + if model_name is None: + return default_model + + # If different model requested, load it + if model_name != os.getenv("DEFAULT_MODEL", "turbo"): + model_download_root = os.getenv("MODEL_DOWNLOAD_ROOT", None) + logger.info(f"Loading requested model: {model_name}") + return whisper.load_model(model_name, download_root=model_download_root) + + return default_model def convert_audio(input_path: str, output_path: str, speed: float = 1.0): - """ - Convert audio to compatible format and speed up if needed. - """ + """Convert audio to compatible format and speed up if needed.""" try: command = [ 'ffmpeg', '-i', input_path, @@ -57,97 +90,69 @@ def convert_audio(input_path: str, output_path: str, speed: float = 1.0): '-y' ] logger.debug(f"Running FFmpeg command: {' '.join(command)}") - subprocess.run(command, check=True, capture_output=True) + result = subprocess.run(command, check=True, capture_output=True, text=True) return True except subprocess.CalledProcessError as e: - logger.error(f"FFmpeg conversion failed: {e.stderr.decode()}") + logger.error(f"FFmpeg conversion failed: {e.stderr}") return False - -def get_audio_duration(file_path: str) -> float: - """Get audio duration using ffprobe""" - cmd = [ - 'ffprobe', - '-v', 'quiet', - '-show_entries', 'format=duration', - '-of', 'default=noprint_wrappers=1:nokey=1', - file_path - ] - try: - output = subprocess.check_output(cmd).decode().strip() - return float(output) - except: - return 0.0 - - @app.post("/transcribe") async def transcribe_audio( - file: UploadFile = File(...), - token: str = Depends(api_key_header), - model: str = "turbo", - verbose: Optional[bool] = None, - temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - compression_ratio_threshold: Optional[float] = 2.4, - speed_up: Optional[float] = 1.25, - logprob_threshold: Optional[float] = -1.0, - no_speech_threshold: Optional[float] = 0.6, - condition_on_previous_text: bool = True, - initial_prompt: Optional[str] = None, - word_timestamps: bool = False, - prepend_punctuations: str = "\"'\"¿([{-", - append_punctuations: str = "\"\'.。,,!!??::\")]}、", - clip_timestamps: Union[str, List[float]] = "0", - hallucination_silence_threshold: Optional[float] = None + file: UploadFile = File(...), + token: str = Depends(api_key_header), + model_name: Optional[str] = Query(None, description="Model name to use for transcription"), + output_format: OutputFormat = Query(OutputFormat.json, description="Output format: plaintext, simple, or json"), + speedup: float = Query(1.0, ge=0.25, le=4.0, description="Speed up factor for audio (0.25-4.0)") ): + """Transcribe audio file with configurable output format""" + # Token validation if token not in get_keys(): logger.warning(f"Invalid token attempt: {token}") raise HTTPException(status_code=403, detail="Forbidden") - model = whisper.load_model(model) # Load the Whisper model + logger.info(f"Processing file: {file.filename}, model: {model_name or 'default'}, format: {output_format}, speedup: {speedup}") - logger.info(f"Processing file: {file.filename} with model: {model}") + # Get model + try: + model = get_model(model_name) + except Exception as e: + logger.error(f"Failed to load model: {e}") + raise HTTPException(status_code=500, detail=f"Failed to load model: {str(e)}") - # Save uploaded file - temp_input_path = f"/tmp/input_{file.filename}" - temp_output_path = f"/tmp/converted_{file.filename}.wav" + # Create temporary files + with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as temp_input: + temp_input_path = temp_input.name + + with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_output: + temp_output_path = temp_output.name try: + # Save uploaded file with open(temp_input_path, "wb") as f: - f.write(await file.read()) + content = await file.read() + f.write(content) - # Convert audio if needed - logger.debug("Converting audio file") - if not convert_audio(temp_input_path, temp_output_path, speed_up): - raise HTTPException(status_code=400, detail="Audio conversion failed") - - # Get audio duration before speed up - original_duration = get_audio_duration(temp_input_path) + # Convert audio if speedup is not 1.0 or format needs conversion + if speedup != 1.0 or not file.filename.lower().endswith('.wav'): + logger.debug(f"Converting audio file with speedup: {speedup}") + if not convert_audio(temp_input_path, temp_output_path, speedup): + raise HTTPException(status_code=400, detail="Audio conversion failed") + audio_file_path = temp_output_path + else: + audio_file_path = temp_input_path # Transcribe logger.info("Starting transcription") - if original_duration > 30: - logger.info("Audio duration > 30 seconds, using transcribe_longform") - transcription_result = model.transcribe_longform( - temp_output_path - ) - else: - logger.info("Audio duration <= 30 seconds, using transcribe") - transcription_result = model.transcribe( - temp_output_path - ) + result = model.transcribe(audio_file_path) - full_text = "" - for part in transcription_result: - if part["transcription"].strip() != "": - full_text += part["transcription"].strip() + " " - - result = { - "transcription": transcription_result, - "text": full_text - } - - return result + # Format output based on requested format + if output_format == OutputFormat.plaintext: + return PlainTextResponse(content=result["text"], media_type="text/plain") + elif output_format == OutputFormat.simple: + return {"text": result["text"]} + else: # json format + return result except Exception as e: logger.error(f"Transcription failed: {str(e)}") @@ -155,16 +160,29 @@ async def transcribe_audio( finally: # Cleanup temporary files - if os.path.exists(temp_input_path): - os.remove(temp_input_path) - if os.path.exists(temp_output_path): - os.remove(temp_output_path) + for path in [temp_input_path, temp_output_path]: + if os.path.exists(path): + try: + os.remove(path) + except Exception as e: + logger.warning(f"Failed to remove temp file {path}: {e}") +@app.get("/health") +async def health_check(): + """Health check endpoint""" + return {"status": "healthy", "model_loaded": default_model is not None} def main(): import uvicorn + + # Load default model and keys + load_default_model() get_keys() - uvicorn.run(app, host="0.0.0.0", port=9854, log_level="debug") + + port = int(os.getenv("PORT", 9854)) + host = os.getenv("HOST", "0.0.0.0") + + uvicorn.run(app, host=host, port=port, log_level="info") if __name__ == "__main__": main() diff --git a/docker-compose.yml b/docker-compose.yml index cbc6f67..c6d6ae7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,31 @@ services: - whisper-app: + simple-asr-server: build: . ports: - - "9854:9854" - command: ["python", "app.py"] + - "${PORT:-9854}:9854" + environment: + - HOST=${HOST:-0.0.0.0} + - PORT=${PORT:-9854} + - DEFAULT_MODEL=${DEFAULT_MODEL:-turbo} + - MODEL_DOWNLOAD_ROOT=${MODEL_DOWNLOAD_ROOT:-/app/models} + - KEYS_FILE=${KEYS_FILE:-/app/data/keys.txt} + - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-10.3.0} + volumes: + - ./models:/app/models + - ./data:/app/data + devices: + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + group_add: + - video + - render + security_opt: + - seccomp:unconfined + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9854/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + diff --git a/requirements.txt b/requirements.txt index 6a0fedf..217c4d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,5 @@ fastapi uvicorn[standard] python-multipart -gigaam -gigaam[longform] -ffmpeg-python -PyYAML -numpy<2.0.0 \ No newline at end of file +openai-whisper +python-dotenv diff --git a/simple-asr-server.service b/simple-asr-server.service new file mode 100644 index 0000000..9fa30a4 --- /dev/null +++ b/simple-asr-server.service @@ -0,0 +1,20 @@ +[Unit] +Description=Whisper ASR Server (ROCM) +After=network.target +Wants=network.target + +[Service] +Type=exec +User=asr +Group=asr +WorkingDirectory=/opt/asr +ExecStart=/opt/asr/start_server.sh +ExecReload=/bin/kill -HUP $MAINPID +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=asr + +[Install] +WantedBy=multi-user.target diff --git a/start_server.sh b/start_server.sh new file mode 100644 index 0000000..f354e2a --- /dev/null +++ b/start_server.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Simple ASR Server startup script for systemd +# This script loads environment variables from .env file and starts the server + +set -e + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +APP_DIR="${SCRIPT_DIR}" + +# Load environment variables from .env file if it exists +if [ -f "${APP_DIR}/.env" ]; then + echo "Loading environment variables from ${APP_DIR}/.env" + set -a # automatically export all variables + source "${APP_DIR}/.env" + set +a +else + echo "Warning: .env file not found at ${APP_DIR}/.env" + echo "Using default environment variables" +fi + +# Set default values if not provided in .env +export HOST=${HOST:-"0.0.0.0"} +export PORT=${PORT:-9854} +export DEFAULT_MODEL=${DEFAULT_MODEL:-"turbo"} +export MODEL_DOWNLOAD_ROOT=${MODEL_DOWNLOAD_ROOT:-"${APP_DIR}/models"} +export KEYS_FILE=${KEYS_FILE:-"${APP_DIR}/keys.txt"} +export LOG_LEVEL=${LOG_LEVEL:-"INFO"} + +# Create necessary directories +mkdir -p "${MODEL_DOWNLOAD_ROOT}" +mkdir -p "$(dirname "${KEYS_FILE}")" + +# Change to app directory +cd "${APP_DIR}" + +echo "Starting Simple ASR Server..." +echo "Host: ${HOST}" +echo "Port: ${PORT}" +echo "Default Model: ${DEFAULT_MODEL}" +echo "Model Download Root: ${MODEL_DOWNLOAD_ROOT}" +echo "Keys File: ${KEYS_FILE}" +echo "Log Level: ${LOG_LEVEL}" + +# Start the application +exec python3 app.py From ce41cf4a09f976701096e47911885d0355981bb2 Mon Sep 17 00:00:00 2001 From: red Date: Wed, 20 Aug 2025 23:25:05 +0900 Subject: [PATCH 4/5] =?UTF-8?q?-=20=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D1=8B=20=D0=BF=D0=B0=D1=80=D0=B0=D0=BC=D0=B5=D1=82?= =?UTF-8?q?=D1=80=D1=8B=20=D0=BC=D0=BE=D0=B4=D0=B5=D0=BB=D0=B8=20=D0=B2=20?= =?UTF-8?q?=D0=B3=D0=B5=D1=82=20=D1=8D=D0=BD=D0=B4=D0=BF=D0=BE=D0=B8=D0=BD?= =?UTF-8?q?=D1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/app.py b/app.py index 4893492..54a955c 100644 --- a/app.py +++ b/app.py @@ -2,7 +2,7 @@ import logging import os import subprocess import tempfile -from typing import Optional +from typing import Optional, Union, List, Tuple from enum import Enum import whisper @@ -102,9 +102,23 @@ async def transcribe_audio( token: str = Depends(api_key_header), model_name: Optional[str] = Query(None, description="Model name to use for transcription"), output_format: OutputFormat = Query(OutputFormat.json, description="Output format: plaintext, simple, or json"), - speedup: float = Query(1.0, ge=0.25, le=4.0, description="Speed up factor for audio (0.25-4.0)") + speedup: float = Query(1.0, ge=0.25, le=4.0, description="Speed up factor for audio (0.25-4.0)"), + # Whisper model parameters + verbose: Optional[bool] = Query(None, description="Whether to print out the progress and debug messages"), + temperature: Union[float, str] = Query("0.0,0.2,0.4,0.6,0.8,1.0", description="Temperature for sampling (single float or comma-separated values)"), + compression_ratio_threshold: Optional[float] = Query(2.4, description="If the gzip compression ratio is above this value, treat as failed"), + logprob_threshold: Optional[float] = Query(-1.0, description="If the average log probability over sampled tokens is below this value, treat as failed"), + no_speech_threshold: Optional[float] = Query(0.6, description="If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below logprob_threshold, consider the segment as silent"), + condition_on_previous_text: bool = Query(True, description="If True, the previous output of the model is provided as a prompt for the next window"), + initial_prompt: Optional[str] = Query(None, description="Optional text to provide as a prompt for the first window"), + carry_initial_prompt: bool = Query(False, description="If True, the initial prompt is carried over to the next window"), + word_timestamps: bool = Query(False, description="Extract word-level timestamps using the cross-attention pattern and dynamic time warping"), + prepend_punctuations: str = Query("\"'([{-", description="If word_timestamps is True, merge these punctuation marks with the next word"), + append_punctuations: str = Query("\"'.,:;!?)]}", description="If word_timestamps is True, merge these punctuation marks with the previous word"), + clip_timestamps: Union[str, List[float]] = Query("0", description="Comma-separated list of clip timestamps to use for transcription"), + hallucination_silence_threshold: Optional[float] = Query(None, description="When word_timestamps is True, skip silent periods longer than this threshold (in seconds)"), ): - """Transcribe audio file with configurable output format""" + """Transcribe audio file with configurable output format and comprehensive Whisper parameters""" # Token validation if token not in get_keys(): @@ -142,9 +156,59 @@ async def transcribe_audio( else: audio_file_path = temp_input_path + # Prepare transcription parameters + transcribe_params = {} + + # Handle temperature parameter (can be single value or tuple) + if isinstance(temperature, str) and "," in temperature: + try: + temp_values = [float(x.strip()) for x in temperature.split(",")] + transcribe_params["temperature"] = tuple(temp_values) + except ValueError: + transcribe_params["temperature"] = 0.0 + else: + try: + transcribe_params["temperature"] = float(temperature) + except (ValueError, TypeError): + transcribe_params["temperature"] = 0.0 + + # Handle clip_timestamps parameter + if isinstance(clip_timestamps, str) and clip_timestamps != "0": + try: + if "," in clip_timestamps: + transcribe_params["clip_timestamps"] = [float(x.strip()) for x in clip_timestamps.split(",")] + else: + transcribe_params["clip_timestamps"] = clip_timestamps + except ValueError: + transcribe_params["clip_timestamps"] = "0" + else: + transcribe_params["clip_timestamps"] = clip_timestamps + + # Add other parameters if they are not None + if verbose is not None: + transcribe_params["verbose"] = verbose + if compression_ratio_threshold is not None: + transcribe_params["compression_ratio_threshold"] = compression_ratio_threshold + if logprob_threshold is not None: + transcribe_params["logprob_threshold"] = logprob_threshold + if no_speech_threshold is not None: + transcribe_params["no_speech_threshold"] = no_speech_threshold + + transcribe_params["condition_on_previous_text"] = condition_on_previous_text + transcribe_params["carry_initial_prompt"] = carry_initial_prompt + transcribe_params["word_timestamps"] = word_timestamps + transcribe_params["prepend_punctuations"] = prepend_punctuations + transcribe_params["append_punctuations"] = append_punctuations + + if initial_prompt is not None: + transcribe_params["initial_prompt"] = initial_prompt + if hallucination_silence_threshold is not None: + transcribe_params["hallucination_silence_threshold"] = hallucination_silence_threshold + # Transcribe logger.info("Starting transcription") - result = model.transcribe(audio_file_path) + logger.debug(f"Transcription parameters: {transcribe_params}") + result = model.transcribe(audio_file_path, **transcribe_params) # Format output based on requested format if output_format == OutputFormat.plaintext: @@ -170,7 +234,7 @@ async def transcribe_audio( @app.get("/health") async def health_check(): """Health check endpoint""" - return {"status": "healthy", "model_loaded": default_model is not None} + return {"status": "healthy", "model_loaded": default_model is not None, "model_name": default_model.__str__()} def main(): import uvicorn From d70f2e7089e29d6b3c4888e78adca54927241db0 Mon Sep 17 00:00:00 2001 From: red Date: Wed, 3 Sep 2025 10:50:44 +0300 Subject: [PATCH 5/5] =?UTF-8?q?=D0=9D=D0=B0=D0=B1=D0=BE=D1=80=20=D0=B2?= =?UTF-8?q?=D1=81=D1=8F=D0=BA=D0=B8=D1=85=20=D1=88=D1=82=D1=83=D0=BA=20?= =?UTF-8?q?=D0=B4=D0=BB=D1=8F=20=D0=B4=D0=B5=D0=BF=D0=BB=D0=BE=D1=8F.=20?= =?UTF-8?q?=D0=9F=D1=80=D0=B5=D0=B4=D0=BF=D0=BE=D0=BB=D0=B0=D0=B3=D0=B0?= =?UTF-8?q?=D0=B5=D1=82=D1=81=D1=8F=20=D1=87=D1=82=D0=BE=20ROCM=20=D1=81?= =?UTF-8?q?=D1=82=D0=BE=D0=B8=D1=82=20=D0=BD=D0=B0=20=D1=85=D0=BE=D1=81?= =?UTF-8?q?=D1=82=D0=B5!?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 19 ++++++++++++++----- .../inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/material_theme_project_new.xml | 12 ++++++++++++ .idea/misc.xml | 7 +++++++ .idea/vcs.xml | 6 ++++++ Dockerfile | 7 ++----- requirements.txt | 2 ++ start_server.sh | 17 +++++++++++++++++ 8 files changed, 66 insertions(+), 10 deletions(-) create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/material_theme_project_new.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/vcs.xml mode change 100644 => 100755 start_server.sh diff --git a/.env.example b/.env.example index 73fcce6..779f616 100644 --- a/.env.example +++ b/.env.example @@ -1,13 +1,22 @@ + # Server configuration HOST=0.0.0.0 PORT=9854 # Model configuration -DEFAULT_MODEL=turbo -MODEL_DOWNLOAD_ROOT=/app/models +DEFAULT_MODEL=tiny +MODEL_DOWNLOAD_ROOT=./models -# API Keys -KEYS_FILE=/app/keys.txt +# Security configuration +KEYS_FILE=keys.txt -# Logging +# Logging configuration (optional) LOG_LEVEL=INFO + +# ROCm GPU configuration +HSA_OVERRIDE_GFX_VERSION=10.3.0 +ROCM_PATH=/opt/rocm + +# Example of available Whisper models: +# tiny, base, small, medium, large, turbo +# turbo is recommended for best speed/quality balance diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/material_theme_project_new.xml b/.idea/material_theme_project_new.xml new file mode 100644 index 0000000..a8dd540 --- /dev/null +++ b/.idea/material_theme_project_new.xml @@ -0,0 +1,12 @@ + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..b79d30e --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 57398d0..9b71450 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ -# Use ROCm compatible Python image as base -FROM rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2 +# Use official Python image as base +FROM python:3.10-slim # Set working directory WORKDIR /app @@ -31,8 +31,6 @@ RUN mkdir -p /app/models /app/data ENV PYTHONUNBUFFERED=1 ENV MODEL_DOWNLOAD_ROOT=/app/models ENV KEYS_FILE=/app/data/keys.txt -ENV HSA_OVERRIDE_GFX_VERSION=10.3.0 -ENV ROCM_PATH=/opt/rocm # Expose port EXPOSE 9854 @@ -43,4 +41,3 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ # Run the application CMD ["python", "app.py"] - diff --git a/requirements.txt b/requirements.txt index 217c4d8..c3f060d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ uvicorn[standard] python-multipart openai-whisper python-dotenv + + diff --git a/start_server.sh b/start_server.sh old mode 100644 new mode 100755 index f354e2a..2ac7bf9 --- a/start_server.sh +++ b/start_server.sh @@ -32,6 +32,22 @@ export LOG_LEVEL=${LOG_LEVEL:-"INFO"} mkdir -p "${MODEL_DOWNLOAD_ROOT}" mkdir -p "$(dirname "${KEYS_FILE}")" +# Check if virtual environment exists, create if not +VENV_DIR="${APP_DIR}/venv" +if [ ! -d "${VENV_DIR}" ]; then + echo "Creating virtual environment..." + python3 -m venv "${VENV_DIR}" +fi + +# Activate virtual environment +echo "Activating virtual environment..." +source "${VENV_DIR}/bin/activate" + +# Install/upgrade dependencies +echo "Installing/upgrading dependencies..." +pip install --upgrade pip +pip install -r "${APP_DIR}/requirements.txt" + # Change to app directory cd "${APP_DIR}" @@ -45,3 +61,4 @@ echo "Log Level: ${LOG_LEVEL}" # Start the application exec python3 app.py +