Merge branch 'master' into Whisper-Based

2025-08-17 23:28:08 +09:00
parent 9eb026b220 f718da13d6
commit 6ce92c130d
6 changed files with 152 additions and 24 deletions
--- a/2
+++ b/2
@@ -5,7 +5,7 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y \
    ffmpeg \
    python3-pip \
-    && rm -rf /var/lib/apt/lists/*
+    python3-venv && rm -rf /var/lib/apt/lists/*

 COPY requirements.txt .
 RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Vladimirov Vladislav
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,104 @@
+Based on [GigaAM](https://github.com/salute-developers/GigaAM).
+
+# Simple ASR Server
+
+This project provides a RESTful API for audio transcription using the GigaAM speech-recognition model. The API is built with FastAPI and runs in a Docker container.
+
+## Prerequisites
+
+Before you begin, ensure you have the following installed:
+
+*   [Docker](https://docs.docker.com/get-docker/)
+*   [Docker Compose](https://docs.docker.com/compose/install/)
+
+## Project Structure
+
+```
+.
+├── app.py              # Main application file with FastAPI endpoint
+├── docker-compose.yml  # Docker Compose configuration
+├── Dockerfile          # Dockerfile for building the application image
+├── model/              # Directory for Whisper model files
+└── requirements.txt    # Python dependencies
+```
+
+## Setup
+
+1.  **Clone the repository:**
+
+    ```bash
+    git clone https://github.com/SlavaVlad/simple-asr-server
+    cd simple-asr-server
+    ```
+2.  **Add API keys:**
+
+    Create a `keys.txt` file in the root of the project and add your API keys, one per line.
+
+## Building and Running the Project
+
+You can build and run the project using Docker Compose.
+
+1.  **Build the Docker image:**
+
+    ```bash
+    docker-compose build
+    ```
+
+2.  **Run the container:**
+
+    ```bash
+    docker-compose up
+    ```
+
+    The application will be available at `http://localhost:9854`.
+
+## API Endpoint
+
+### POST /transcribe
+
+This endpoint accepts an audio file and returns the transcription.
+
+*   **URL:** `/transcribe`
+*   **Method:** `POST`
+*   **Headers:**
+    *   `X-API-Key`: Your API key.
+*   **Form Data:**
+    *   `file`: The audio file to be transcribed.
+
+**Example using `curl`:**
+
+```bash
+curl -X POST "http://localhost:9854/transcribe" \
+     -H "X-API-Key: YOUR_API_KEY" \
+     -F "file=@/path/to/your/audio.wav"
+```
+
+**Successful Response (200 OK):**
+
+```json
+{
+  "transcription": [
+    {
+      "start_time": 0.0,
+      "end_time": 2.5,
+      "transcription": "Hello world."
+    }
+  ],
+  "text": "Hello world. ",
+  "metrics": {
+    "processing_time": 5.2,
+    "rtf": 0.5,
+    "word_rate": 2.0
+  }
+}
+```
+
+**Error Response (401 Unauthorized):**
+
+Returned when the API key is missing or invalid.
+
+```json
+{
+  "detail": "Invalid API Key"
+}
+```
--- a/app.py
+++ b/app.py
@@ -2,10 +2,10 @@ import logging
 import os
 import subprocess
 import time
+from os import getenv
 from typing import Dict
-from typing import Optional, Union, List, Tuple

-import whisper
+import gigaam
 from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
 from fastapi.security import APIKeyHeader

@@ -126,21 +126,26 @@ async def transcribe_audio(

        # Transcribe
        logger.info("Starting transcription")
-        result = model.transcribe(
-            temp_output_path,
-            verbose=verbose,
-            temperature=temperature,
-            compression_ratio_threshold=compression_ratio_threshold,
-            logprob_threshold=logprob_threshold,
-            no_speech_threshold=no_speech_threshold,
-            condition_on_previous_text=condition_on_previous_text,
-            initial_prompt=initial_prompt,
-            word_timestamps=word_timestamps,
-            prepend_punctuations=prepend_punctuations,
-            append_punctuations=append_punctuations,
-            clip_timestamps=clip_timestamps,
-            hallucination_silence_threshold=hallucination_silence_threshold
-        )
+        if original_duration > 30:
+            logger.info("Audio duration > 30 seconds, using transcribe_longform")
+            transcription_result = model.transcribe_longform(
+                temp_output_path
+            )
+        else:
+            logger.info("Audio duration <= 30 seconds, using transcribe")
+            transcription_result = model.transcribe(
+                temp_output_path
+            )
+
+        full_text = ""
+        for part in transcription_result:
+            if part["transcription"].strip() != "":
+                full_text += part["transcription"].strip() + " "
+
+        result = {
+            "transcription": transcription_result,
+            "text": full_text
+        }

        return result

--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,8 +3,4 @@ services:
    build: .
    ports:
      - "9854:9854"
-    devices:
-      - "/dev/kfd:/dev/kfd"
-      - "/dev/dri:/dev/dri"
-    group_add:
-      - video
+    command: ["python", "app.py"]
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
 fastapi
 uvicorn[standard]
 python-multipart
-openai-whisper
+gigaam
+gigaam[longform]
 ffmpeg-python
 PyYAML
+numpy<2.0.0