diff --git a/Dockerfile b/Dockerfile index d78ae68..2f7749a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /app RUN apt-get update && apt-get install -y \ ffmpeg \ python3-pip \ - && rm -rf /var/lib/apt/lists/* + python3-venv \ COPY requirements.txt . RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d9e5e5d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Vladimirov Vladislav + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..72464ed --- /dev/null +++ b/README.md @@ -0,0 +1,104 @@ +BASED ON https://github.com/salute-developers/GigaAM + +# Simple ASR Server + +This project provides a RESTful API for audio transcription using a GigaAM model. The API is built with FastAPI and runs in a Docker container. 
+ +## Prerequisites + +Before you begin, ensure you have the following installed: + +* [Docker](https://docs.docker.com/get-docker/) +* [Docker Compose](https://docs.docker.com/compose/install/) + +## Project Structure + +``` +. +├── app.py # Main application file with FastAPI endpoint +├── docker-compose.yml # Docker Compose configuration +├── Dockerfile # Dockerfile for building the application image +├── model/ # Directory for GigaAM model files +└── requirements.txt # Python dependencies +``` + +## Setup + +1. **Clone the repository:** + + ```bash + git clone https://github.com/SlavaVlad/simple-asr-server + cd simple-asr-server + ``` +2. **Add API keys:** + + Create a `keys.txt` file in the root of the project and add your API keys, one per line. + +## Building and Running the Project + +You can build and run the project using Docker Compose. + +1. **Build the Docker image:** + + ```bash + docker-compose build + ``` + +2. **Run the container:** + + ```bash + docker-compose up + ``` + + The application will be available at `http://localhost:9854`. + +## API Endpoint + +### POST /transcribe + +This endpoint accepts an audio file and returns the transcription. + +* **URL:** `/transcribe` +* **Method:** `POST` +* **Headers:** + * `X-API-Key`: Your API key. +* **Form Data:** + * `file`: The audio file to be transcribed. + +**Example using `curl`:** + +```bash +curl -X POST "http://localhost:9854/transcribe" \ + -H "X-API-Key: YOUR_API_KEY" \ + -F "file=@/path/to/your/audio.wav" +``` + +**Successful Response (200 OK):** + +```json +{ + "transcription": [ + { + "start_time": 0.0, + "end_time": 2.5, + "transcription": "Hello world." + } + ], + "text": "Hello world. ", + "metrics": { + "processing_time": 5.2, + "rtf": 0.5, + "word_rate": 2.0 + } +} +``` + +**Error Response (401 Unauthorized):** + +If the API key is missing or invalid. 
+ +```json +{ + "detail": "Invalid API Key" +} +``` diff --git a/app.py b/app.py index f5cf989..cca21a8 100644 --- a/app.py +++ b/app.py @@ -2,10 +2,10 @@ import logging import os import subprocess import time +from os import getenv from typing import Dict -from typing import Optional, Union, List, Tuple -import whisper +import gigaam from fastapi import FastAPI, Depends, HTTPException, UploadFile, File from fastapi.security import APIKeyHeader @@ -126,21 +126,26 @@ async def transcribe_audio( # Transcribe logger.info("Starting transcription") - result = model.transcribe( - temp_output_path, - verbose=verbose, - temperature=temperature, - compression_ratio_threshold=compression_ratio_threshold, - logprob_threshold=logprob_threshold, - no_speech_threshold=no_speech_threshold, - condition_on_previous_text=condition_on_previous_text, - initial_prompt=initial_prompt, - word_timestamps=word_timestamps, - prepend_punctuations=prepend_punctuations, - append_punctuations=append_punctuations, - clip_timestamps=clip_timestamps, - hallucination_silence_threshold=hallucination_silence_threshold - ) + if original_duration > 30: + logger.info("Audio duration > 30 seconds, using transcribe_longform") + transcription_result = model.transcribe_longform( + temp_output_path + ) + else: + logger.info("Audio duration <= 30 seconds, using transcribe") + transcription_result = model.transcribe( + temp_output_path + ) + + full_text = "" + for part in transcription_result: + if part["transcription"].strip() != "": + full_text += part["transcription"].strip() + " " + + result = { + "transcription": transcription_result, + "text": full_text + } return result diff --git a/docker-compose.yml b/docker-compose.yml index 9841f63..cbc6f67 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,8 +3,4 @@ services: build: . 
ports: - "9854:9854" - devices: - - "/dev/kfd:/dev/kfd" - - "/dev/dri:/dev/dri" - group_add: - - video + command: ["python", "app.py"] diff --git a/requirements.txt b/requirements.txt index a35e823..6a0fedf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ fastapi uvicorn[standard] python-multipart -openai-whisper +gigaam +gigaam[longform] ffmpeg-python PyYAML +numpy<2.0.0 \ No newline at end of file