Merge branch 'master' into Whisper-Based
This commit is contained in:
@@ -5,7 +5,7 @@ WORKDIR /app
|
|||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
ffmpeg \
|
ffmpeg \
|
||||||
python3-pip \
|
python3-pip \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
python3-venv \
|
||||||
|
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt
|
RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt
|
||||||
|
|||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 Vladimirov Vladislav
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
104
README.md
Normal file
104
README.md
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
Based on [GigaAM](https://github.com/salute-developers/GigaAM).
|
||||||
|
|
||||||
|
# Simple ASR Server
|
||||||
|
|
||||||
|
This project provides a RESTful API for audio transcription using a Whisper model. The API is built with FastAPI and runs in a Docker container.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Before you begin, ensure you have the following installed:
|
||||||
|
|
||||||
|
* [Docker](https://docs.docker.com/get-docker/)
|
||||||
|
* [Docker Compose](https://docs.docker.com/compose/install/)
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
.
|
||||||
|
├── app.py # Main application file with FastAPI endpoint
|
||||||
|
├── docker-compose.yml # Docker Compose configuration
|
||||||
|
├── Dockerfile # Dockerfile for building the application image
|
||||||
|
├── model/ # Directory for Whisper model files
|
||||||
|
└── requirements.txt # Python dependencies
|
||||||
|
```
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
1. **Clone the repository:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/SlavaVlad/simple-asr-server
|
||||||
|
cd simple-asr-server
|
||||||
|
```
|
||||||
|
2. **Add API keys:**
|
||||||
|
|
||||||
|
Create a `keys.txt` file in the root of the project and add your API keys, one per line.
|
||||||
|
|
||||||
|
## Building and Running the Project
|
||||||
|
|
||||||
|
You can build and run the project using Docker Compose.
|
||||||
|
|
||||||
|
1. **Build the Docker image:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose build
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Run the container:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose up
|
||||||
|
```
|
||||||
|
|
||||||
|
The application will be available at `http://localhost:9854`.
|
||||||
|
|
||||||
|
## API Endpoint
|
||||||
|
|
||||||
|
### POST /transcribe
|
||||||
|
|
||||||
|
This endpoint accepts an audio file and returns the transcription.
|
||||||
|
|
||||||
|
* **URL:** `/transcribe`
|
||||||
|
* **Method:** `POST`
|
||||||
|
* **Headers:**
|
||||||
|
* `X-API-Key`: Your API key.
|
||||||
|
* **Form Data:**
|
||||||
|
* `file`: The audio file to be transcribed.
|
||||||
|
|
||||||
|
**Example using `curl`:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST "http://localhost:9854/transcribe" \
|
||||||
|
-H "X-API-Key: YOUR_API_KEY" \
|
||||||
|
-F "file=@/path/to/your/audio.wav"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Successful Response (200 OK):**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"transcription": [
|
||||||
|
{
|
||||||
|
"start_time": 0.0,
|
||||||
|
"end_time": 2.5,
|
||||||
|
"transcription": "Hello world."
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"text": "Hello world. ",
|
||||||
|
"metrics": {
|
||||||
|
"processing_time": 5.2,
|
||||||
|
"rtf": 0.5,
|
||||||
|
"word_rate": 2.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Error Response (401 Unauthorized):**
|
||||||
|
|
||||||
|
If the API key is missing or invalid.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"detail": "Invalid API Key"
|
||||||
|
}
|
||||||
|
```
|
||||||
39
app.py
39
app.py
@@ -2,10 +2,10 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
|
from os import getenv
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from typing import Optional, Union, List, Tuple
|
|
||||||
|
|
||||||
import whisper
|
import gigaam
|
||||||
from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
|
from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
|
||||||
from fastapi.security import APIKeyHeader
|
from fastapi.security import APIKeyHeader
|
||||||
|
|
||||||
@@ -126,21 +126,26 @@ async def transcribe_audio(
|
|||||||
|
|
||||||
# Transcribe
|
# Transcribe
|
||||||
logger.info("Starting transcription")
|
logger.info("Starting transcription")
|
||||||
result = model.transcribe(
|
if original_duration > 30:
|
||||||
temp_output_path,
|
logger.info("Audio duration > 30 seconds, using transcribe_longform")
|
||||||
verbose=verbose,
|
transcription_result = model.transcribe_longform(
|
||||||
temperature=temperature,
|
temp_output_path
|
||||||
compression_ratio_threshold=compression_ratio_threshold,
|
)
|
||||||
logprob_threshold=logprob_threshold,
|
else:
|
||||||
no_speech_threshold=no_speech_threshold,
|
logger.info("Audio duration <= 30 seconds, using transcribe")
|
||||||
condition_on_previous_text=condition_on_previous_text,
|
transcription_result = model.transcribe(
|
||||||
initial_prompt=initial_prompt,
|
temp_output_path
|
||||||
word_timestamps=word_timestamps,
|
)
|
||||||
prepend_punctuations=prepend_punctuations,
|
|
||||||
append_punctuations=append_punctuations,
|
full_text = ""
|
||||||
clip_timestamps=clip_timestamps,
|
for part in transcription_result:
|
||||||
hallucination_silence_threshold=hallucination_silence_threshold
|
if part["transcription"].strip() != "":
|
||||||
)
|
full_text += part["transcription"].strip() + " "
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"transcription": transcription_result,
|
||||||
|
"text": full_text
|
||||||
|
}
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -3,8 +3,4 @@ services:
|
|||||||
build: .
|
build: .
|
||||||
ports:
|
ports:
|
||||||
- "9854:9854"
|
- "9854:9854"
|
||||||
devices:
|
command: ["python", "app.py"]
|
||||||
- "/dev/kfd:/dev/kfd"
|
|
||||||
- "/dev/dri:/dev/dri"
|
|
||||||
group_add:
|
|
||||||
- video
|
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
fastapi
|
fastapi
|
||||||
uvicorn[standard]
|
uvicorn[standard]
|
||||||
python-multipart
|
python-multipart
|
||||||
openai-whisper
|
gigaam
|
||||||
|
gigaam[longform]
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
PyYAML
|
PyYAML
|
||||||
|
numpy<2.0.0
|
||||||
Reference in New Issue
Block a user