Merge branch 'master' into Whisper-Based
This commit is contained in:
@@ -5,7 +5,7 @@ WORKDIR /app
|
||||
RUN apt-get update && apt-get install -y \
|
||||
ffmpeg \
|
||||
python3-pip \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
python3-venv \
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt
|
||||
|
||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 Vladimirov Vladislav
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
104
README.md
Normal file
104
README.md
Normal file
@@ -0,0 +1,104 @@
|
||||
BASED ON https://github.com/salute-developers/GigaAM
|
||||
|
||||
# Simple ASR Server
|
||||
|
||||
This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, ensure you have the following installed:
|
||||
|
||||
* [Docker](https://docs.docker.com/get-docker/)
|
||||
* [Docker Compose](https://docs.docker.com/compose/install/)
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
.
|
||||
├── app.py # Main application file with FastAPI endpoint
|
||||
├── docker-compose.yml # Docker Compose configuration
|
||||
├── Dockerfile # Dockerfile for building the application image
|
||||
├── model/ # Directory for Whisper model files
|
||||
└── requirements.txt # Python dependencies
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
1. **Clone the repository:**
|
||||
|
||||
```bash
|
||||
git clone https://github.com/SlavaVlad/simple-asr-server
|
||||
cd simple-asr-server
|
||||
```
|
||||
2. **Add API keys:**
|
||||
|
||||
Create a `keys.txt` file in the root of the project and add your API keys, one per line.
|
||||
|
||||
## Building and Running the Project
|
||||
|
||||
You can build and run the project using Docker Compose.
|
||||
|
||||
1. **Build the Docker image:**
|
||||
|
||||
```bash
|
||||
docker-compose build
|
||||
```
|
||||
|
||||
2. **Run the container:**
|
||||
|
||||
```bash
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
The application listens on port `9854` and will be available at `http://localhost:9854`.
|
||||
|
||||
## API Endpoint
|
||||
|
||||
### POST /transcribe
|
||||
|
||||
This endpoint accepts an audio file and returns the transcription.
|
||||
|
||||
* **URL:** `/transcribe`
|
||||
* **Method:** `POST`
|
||||
* **Headers:**
|
||||
* `X-API-Key`: Your API key.
|
||||
* **Form Data:**
|
||||
* `file`: The audio file to be transcribed.
|
||||
|
||||
**Example using `curl`:**
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:9854/transcribe" \
|
||||
-H "X-API-Key: YOUR_API_KEY" \
|
||||
-F "file=@/path/to/your/audio.wav"
|
||||
```
|
||||
|
||||
**Successful Response (200 OK):**
|
||||
|
||||
```json
|
||||
{
|
||||
"transcription": [
|
||||
{
|
||||
"start_time": 0.0,
|
||||
"end_time": 2.5,
|
||||
"transcription": "Hello world."
|
||||
}
|
||||
],
|
||||
"text": "Hello world. ",
|
||||
"metrics": {
|
||||
"processing_time": 5.2,
|
||||
"rtf": 0.5,
|
||||
"word_rate": 2.0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Error Response (401 Unauthorized):**
|
||||
|
||||
Returned if the API key is missing or invalid.
|
||||
|
||||
```json
|
||||
{
|
||||
"detail": "Invalid API Key"
|
||||
}
|
||||
```
|
||||
39
app.py
39
app.py
@@ -2,10 +2,10 @@ import logging
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from os import getenv
|
||||
from typing import Dict
|
||||
from typing import Optional, Union, List, Tuple
|
||||
|
||||
import whisper
|
||||
import gigaam
|
||||
from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
|
||||
from fastapi.security import APIKeyHeader
|
||||
|
||||
@@ -126,21 +126,26 @@ async def transcribe_audio(
|
||||
|
||||
# Transcribe
|
||||
logger.info("Starting transcription")
|
||||
result = model.transcribe(
|
||||
temp_output_path,
|
||||
verbose=verbose,
|
||||
temperature=temperature,
|
||||
compression_ratio_threshold=compression_ratio_threshold,
|
||||
logprob_threshold=logprob_threshold,
|
||||
no_speech_threshold=no_speech_threshold,
|
||||
condition_on_previous_text=condition_on_previous_text,
|
||||
initial_prompt=initial_prompt,
|
||||
word_timestamps=word_timestamps,
|
||||
prepend_punctuations=prepend_punctuations,
|
||||
append_punctuations=append_punctuations,
|
||||
clip_timestamps=clip_timestamps,
|
||||
hallucination_silence_threshold=hallucination_silence_threshold
|
||||
)
|
||||
if original_duration > 30:
|
||||
logger.info("Audio duration > 30 seconds, using transcribe_longform")
|
||||
transcription_result = model.transcribe_longform(
|
||||
temp_output_path
|
||||
)
|
||||
else:
|
||||
logger.info("Audio duration <= 30 seconds, using transcribe")
|
||||
transcription_result = model.transcribe(
|
||||
temp_output_path
|
||||
)
|
||||
|
||||
full_text = ""
|
||||
for part in transcription_result:
|
||||
if part["transcription"].strip() != "":
|
||||
full_text += part["transcription"].strip() + " "
|
||||
|
||||
result = {
|
||||
"transcription": transcription_result,
|
||||
"text": full_text
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@@ -3,8 +3,4 @@ services:
|
||||
build: .
|
||||
ports:
|
||||
- "9854:9854"
|
||||
devices:
|
||||
- "/dev/kfd:/dev/kfd"
|
||||
- "/dev/dri:/dev/dri"
|
||||
group_add:
|
||||
- video
|
||||
command: ["python", "app.py"]
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
python-multipart
|
||||
openai-whisper
|
||||
gigaam
|
||||
gigaam[longform]
|
||||
ffmpeg-python
|
||||
PyYAML
|
||||
numpy<2.0.0
|
||||
Reference in New Issue
Block a user