From d8815b55848bf640ccc90394c1c3f98a79655f3e Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 00:17:20 +0300 Subject: [PATCH] =?UTF-8?q?=D0=97=D0=B0=D0=BC=D0=B5=D0=BD=D0=B0=20=D0=BC?= =?UTF-8?q?=D0=BE=D0=B4=D0=B5=D0=BB=D0=B8=20ASR=20=D0=BD=D0=B0=20GigaAM=20?= =?UTF-8?q?(CTC=20v2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 54 +++++++++++++++++++++------------------------- docker-compose.yml | 12 ++++++----- requirements.txt | 1 + 3 files changed, 32 insertions(+), 35 deletions(-) diff --git a/app.py b/app.py index cb47a23..b984f52 100644 --- a/app.py +++ b/app.py @@ -4,8 +4,8 @@ import subprocess import time from typing import Dict from typing import Optional, Union, List, Tuple +import gigaam -import whisper from fastapi import FastAPI, Depends, HTTPException, UploadFile, File from fastapi.security import APIKeyHeader @@ -18,6 +18,7 @@ logger = logging.getLogger(__name__) app = FastAPI() +model = gigaam.load_model("v2_ctc", device="cuda", download_root="./model") # API key header api_key_header = APIKeyHeader(name="x-api-key") @@ -108,19 +109,7 @@ def get_audio_duration(file_path: str) -> float: async def transcribe_audio( file: UploadFile = File(...), token: str = Depends(api_key_header), - model_name: str = "turbo", - verbose: Optional[bool] = None, - temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - compression_ratio_threshold: Optional[float] = 2.4, - logprob_threshold: Optional[float] = -1.0, - no_speech_threshold: Optional[float] = 0.6, - condition_on_previous_text: bool = True, - initial_prompt: Optional[str] = None, - word_timestamps: bool = False, - prepend_punctuations: str = "\"'\"¿([{-", - append_punctuations: str = "\"\'.。,,!!??::\")]}、", - clip_timestamps: Union[str, List[float]] = "0", - hallucination_silence_threshold: Optional[float] = None + model_name: str = "turbo" ): # Token validation if token not in get_keys(): @@ -148,24 +137,29 @@ async def transcribe_audio( # Transcribe logger.info("Starting transcription") - result = model.transcribe( - temp_output_path, - verbose=verbose, - temperature=temperature, - compression_ratio_threshold=compression_ratio_threshold, - logprob_threshold=logprob_threshold, - no_speech_threshold=no_speech_threshold, - condition_on_previous_text=condition_on_previous_text, - initial_prompt=initial_prompt, - word_timestamps=word_timestamps, - prepend_punctuations=prepend_punctuations, - append_punctuations=append_punctuations, - clip_timestamps=clip_timestamps, - hallucination_silence_threshold=hallucination_silence_threshold - ) + if original_duration > 30: + logger.info("Audio duration > 30 seconds, using transcribe_longform") + transcription_result = model.transcribe_longform( + temp_output_path + ) + else: + logger.info("Audio duration <= 30 seconds, using transcribe") + transcription_result = model.transcribe( + temp_output_path + ) + + full_text = "" + for part in transcription_result: + if part["transcription"].strip() != "": + full_text += part["transcription"].strip() + " " + + result = { + "transcription": transcription_result, + "text": full_text + } # Calculate metrics - metrics.stop(result["text"], original_duration) + metrics.stop(full_text, original_duration) logger.info(f"Transcription metrics: {metrics.get_metrics()}") # Add metrics to result diff --git a/docker-compose.yml b/docker-compose.yml index 9841f63..31db88c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,10 +1,12 @@ +version: '3.8' + services: whisper-app: build: . ports: - "9854:9854" - devices: - - "/dev/kfd:/dev/kfd" - - "/dev/dri:/dev/dri" - group_add: - - video + volumes: + - ./keys.txt:/app/keys.txt + - /tmp:/tmp + command: ["python", "app.py"] + diff --git a/requirements.txt b/requirements.txt index a35e823..7492a97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ python-multipart openai-whisper ffmpeg-python PyYAML +numpy<2.0.0 \ No newline at end of file