From d8815b55848bf640ccc90394c1c3f98a79655f3e Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 00:17:20 +0300 Subject: [PATCH 01/11] =?UTF-8?q?=D0=97=D0=B0=D0=BC=D0=B5=D0=BD=D0=B0=20?= =?UTF-8?q?=D0=BC=D0=BE=D0=B4=D0=B5=D0=BB=D0=B8=20ASR=20=D0=BD=D0=B0=20Gig?= =?UTF-8?q?aAM=20(CTC=20v2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 54 +++++++++++++++++++++------------------------- docker-compose.yml | 12 ++++++----- requirements.txt | 1 + 3 files changed, 32 insertions(+), 35 deletions(-) diff --git a/app.py b/app.py index cb47a23..b984f52 100644 --- a/app.py +++ b/app.py @@ -4,8 +4,8 @@ import subprocess import time from typing import Dict from typing import Optional, Union, List, Tuple +import gigaam -import whisper from fastapi import FastAPI, Depends, HTTPException, UploadFile, File from fastapi.security import APIKeyHeader @@ -18,6 +18,7 @@ logger = logging.getLogger(__name__) app = FastAPI() +model = gigaam.load_model("v2_ctc", device="cuda", download_root="./model") # API key header api_key_header = APIKeyHeader(name="x-api-key") @@ -108,19 +109,7 @@ def get_audio_duration(file_path: str) -> float: async def transcribe_audio( file: UploadFile = File(...), token: str = Depends(api_key_header), - model_name: str = "turbo", - verbose: Optional[bool] = None, - temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - compression_ratio_threshold: Optional[float] = 2.4, - logprob_threshold: Optional[float] = -1.0, - no_speech_threshold: Optional[float] = 0.6, - condition_on_previous_text: bool = True, - initial_prompt: Optional[str] = None, - word_timestamps: bool = False, - prepend_punctuations: str = "\"'\"¿([{-", - append_punctuations: str = "\"\'.。,,!!??::\")]}、", - clip_timestamps: Union[str, List[float]] = "0", - hallucination_silence_threshold: Optional[float] = None + model_name: str = "turbo" ): # Token validation if token not in get_keys(): @@ -148,24 +137,29 @@ async def transcribe_audio( # Transcribe logger.info("Starting transcription") - result = model.transcribe( - temp_output_path, - verbose=verbose, - temperature=temperature, - compression_ratio_threshold=compression_ratio_threshold, - logprob_threshold=logprob_threshold, - no_speech_threshold=no_speech_threshold, - condition_on_previous_text=condition_on_previous_text, - initial_prompt=initial_prompt, - word_timestamps=word_timestamps, - prepend_punctuations=prepend_punctuations, - append_punctuations=append_punctuations, - clip_timestamps=clip_timestamps, - hallucination_silence_threshold=hallucination_silence_threshold - ) + if original_duration > 30: + logger.info("Audio duration > 30 seconds, using transcribe_longform") + transcription_result = model.transcribe_longform( + temp_output_path + ) + else: + logger.info("Audio duration <= 30 seconds, using transcribe") + transcription_result = model.transcribe( + temp_output_path + ) + + full_text = "" + for part in transcription_result: + if part["transcription"].strip() != "": + full_text += part["transcription"].strip() + " " + + result = { + "transcription": transcription_result, + "text": full_text + } # Calculate metrics - metrics.stop(result["text"], original_duration) + metrics.stop(full_text, original_duration) logger.info(f"Transcription metrics: {metrics.get_metrics()}") # Add metrics to result diff --git a/docker-compose.yml b/docker-compose.yml index 9841f63..31db88c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,10 +1,12 @@ +version: '3.8' + services: whisper-app: build: . ports: - "9854:9854" - devices: - - "/dev/kfd:/dev/kfd" - - "/dev/dri:/dev/dri" - group_add: - - video + volumes: + - ./keys.txt:/app/keys.txt + - /tmp:/tmp + command: ["python", "app.py"] + diff --git a/requirements.txt b/requirements.txt index a35e823..7492a97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ python-multipart openai-whisper ffmpeg-python PyYAML +numpy<2.0.0 \ No newline at end of file From d02b1b46d2b5f24819e120ef17f48e9dd98ea317 Mon Sep 17 00:00:00 2001 From: SlavaVlad Date: Tue, 15 Jul 2025 00:26:37 +0300 Subject: [PATCH 02/11] Create README.md --- README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..d343b76 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +Not whisper anymore :) From fede1fcf45098637d7d9d63afae739f475abdfc9 Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 00:32:45 +0300 Subject: [PATCH 03/11] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=B8?= =?UTF-8?q?=D0=BB=20=D0=BF=D0=BE=D0=B4=D1=80=D0=BE=D0=B1=D0=BD=D1=8B=D0=B9?= =?UTF-8?q?=20Readme.md=20=D1=81=20=D0=B8=D0=BD=D1=81=D1=82=D1=80=D1=83?= =?UTF-8?q?=D0=BA=D1=86=D0=B8=D1=8F=D0=BC=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d343b76..95fea9e 100644 --- a/README.md +++ b/README.md @@ -1 +1,102 @@ -Not whisper anymore :) +# Whisper Transcription API + +This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container. + +## Prerequisites + +Before you begin, ensure you have the following installed: + +* [Docker](https://docs.docker.com/get-docker/) +* [Docker Compose](https://docs.docker.com/compose/install/) + +## Project Structure + +``` +. +├── app.py # Main application file with FastAPI endpoint +├── docker-compose.yml # Docker Compose configuration +├── Dockerfile # Dockerfile for building the application image +├── model/ # Directory for Whisper model files +└── requirements.txt # Python dependencies +``` + +## Setup + +1. **Clone the repository:** + + ```bash + git clone https://github.com/SlavaVlad/faster-whisper-api + cd faster-whisper-api + ``` +3. **Add API keys:** + + Create a `keys.txt` file in the root of the project and add your API keys, one per line. + +## Building and Running the Project + +You can build and run the project using Docker Compose. + +1. **Build the Docker image:** + + ```bash + docker-compose build + ``` + +2. **Run the container:** + + ```bash + docker-compose up + ``` + + The application will be available at `http://0.0.0.0:9854`. + +## API Endpoint + +### POST /transcribe + +This endpoint accepts an audio file and returns the transcription. + +* **URL:** `/transcribe` +* **Method:** `POST` +* **Headers:** + * `X-API-Key`: Your API key. +* **Form Data:** + * `file`: The audio file to be transcribed. + +**Example using `curl`:** + +```bash +curl -X POST "http://localhost:9854/transcribe" \ + -H "X-API-Key: YOUR_API_KEY" \ + -F "file=@/path/to/your/audio.wav" +``` + +**Successful Response (200 OK):** + +```json +{ + "transcription": [ + { + "start_time": 0.0, + "end_time": 2.5, + "transcription": "Hello world." + } + ], + "text": "Hello world. ", + "metrics": { + "processing_time": 5.2, + "rtf": 0.5, + "word_rate": 2.0 + } +} +``` + +**Error Response (401 Unauthorized):** + +If the API key is missing or invalid. + +```json +{ + "detail": "Invalid API Key" +} +``` From b75f83130a36792a549558e8a7701e227dec1574 Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 00:34:02 +0300 Subject: [PATCH 04/11] =?UTF-8?q?=D0=9E=D0=B1=D0=BD=D0=BE=D0=B2=D0=B8?= =?UTF-8?q?=D0=BB=20=D1=81=D1=81=D1=8B=D0=BB=D0=BA=D1=83=20=D0=BD=D0=B0=20?= =?UTF-8?q?=D1=80=D0=B5=D0=BF=D0=BE=D0=B7=D0=B8=D1=82=D0=BE=D1=80=D0=B8?= =?UTF-8?q?=D0=B9=20=D0=B2=20README.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 95fea9e..54338a1 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ Before you begin, ensure you have the following installed: 1. **Clone the repository:** ```bash - git clone https://github.com/SlavaVlad/faster-whisper-api - cd faster-whisper-api + git clone https://github.com/SlavaVlad/simple-asr-server + cd simple-asr-server ``` 3. **Add API keys:** From 91495aac0f455806e350eafafb8dcda1f279c9d0 Mon Sep 17 00:00:00 2001 From: SlavaVlad Date: Tue, 15 Jul 2025 00:38:19 +0300 Subject: [PATCH 05/11] Create LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d9e5e5d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Vladimirov Vladislav + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From e47e8ec40265dab8ddb500d6c0de86ac5ebfe0a8 Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 01:13:07 +0300 Subject: [PATCH 06/11] fix: requirements.txt updated fix: removed version attribute from docker-compose.yml fix: Dockerfile more correct --- Dockerfile | 2 +- docker-compose.yml | 6 ------ requirements.txt | 2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index d78ae68..2f7749a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /app RUN apt-get update && apt-get install -y \ ffmpeg \ python3-pip \ - && rm -rf /var/lib/apt/lists/* + python3-venv \ COPY requirements.txt . RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt diff --git a/docker-compose.yml b/docker-compose.yml index 31db88c..cbc6f67 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,12 +1,6 @@ -version: '3.8' - services: whisper-app: build: . ports: - "9854:9854" - volumes: - - ./keys.txt:/app/keys.txt - - /tmp:/tmp command: ["python", "app.py"] - diff --git a/requirements.txt b/requirements.txt index 7492a97..dda53b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ fastapi uvicorn[standard] python-multipart -openai-whisper +gigaam ffmpeg-python PyYAML numpy<2.0.0 \ No newline at end of file From c2b060c8a69583833b28cd55dd2399ee790d91f9 Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 01:14:22 +0300 Subject: [PATCH 07/11] Added GigaAM attribution to README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 54338a1..46e9195 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ -# Whisper Transcription API +BASED ON https://github.com/salute-developers/GigaAM + +# Simple ASR Server + +![GitHub Repo stars](https://img.shields.io/github/stars/SlavaVlad/simple-asr-server?style=social) This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container. From 5f9e6bafaa238adc5052deed3ca19a25e1bdce16 Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 01:14:22 +0300 Subject: [PATCH 08/11] Added GigaAM attribution to README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 54338a1..46e9195 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ -# Whisper Transcription API +BASED ON https://github.com/salute-developers/GigaAM + +# Simple ASR Server + +![GitHub Repo stars](https://img.shields.io/github/stars/SlavaVlad/simple-asr-server?style=social) This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container. From aa1d21f9bc8f33cd258d9c6622e1b6411413b4ff Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 01:15:30 +0300 Subject: [PATCH 09/11] Added GigaAM attribution to README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 46e9195..72464ed 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,6 @@ BASED ON https://github.com/salute-developers/GigaAM # Simple ASR Server -![GitHub Repo stars](https://img.shields.io/github/stars/SlavaVlad/simple-asr-server?style=social) - This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container. ## Prerequisites From e7f7120f27815b4121efe66be002753725104b5c Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 02:13:35 +0300 Subject: [PATCH 10/11] =?UTF-8?q?=D0=94=D0=B5=D0=BB=D0=B0=D0=B5=D0=BC=20HI?= =?UTF-8?q?P=20=D1=81=D0=BE=D0=B2=D0=BC=D0=B5=D1=81=D1=82=D0=B8=D0=BC?= =?UTF-8?q?=D0=BE=D1=81=D1=82=D1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index b984f52..781a047 100644 --- a/app.py +++ b/app.py @@ -2,10 +2,10 @@ import logging import os import subprocess import time +from os import getenv from typing import Dict -from typing import Optional, Union, List, Tuple -import gigaam +import gigaam from fastapi import FastAPI, Depends, HTTPException, UploadFile, File from fastapi.security import APIKeyHeader @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) app = FastAPI() -model = gigaam.load_model("v2_ctc", device="cuda", download_root="./model") +model = gigaam.load_model("v2_ctc", device=getenv("ASR_DEVICE"), download_root=getenv("ASR_MODELS_ROOT")) # API key header api_key_header = APIKeyHeader(name="x-api-key") From f718da13d6af706391b1f4f4f4fde62005a21071 Mon Sep 17 00:00:00 2001 From: vladislav Date: Tue, 15 Jul 2025 02:35:58 +0300 Subject: [PATCH 11/11] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=B8?= =?UTF-8?q?=D0=BB=20longform=20=D0=B7=D0=B0=D0=B2=D0=B8=D1=81=D0=B8=D0=BC?= =?UTF-8?q?=D0=BE=D1=81=D1=82=D1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index dda53b3..6a0fedf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ fastapi uvicorn[standard] python-multipart gigaam +gigaam[longform] ffmpeg-python PyYAML numpy<2.0.0 \ No newline at end of file