Micro server with a small UI on port 8000 for running faster_whisper

2025-07-08 22:14:32 +03:00
commit 303b7b7584
5 changed files with 263 additions and 0 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,5 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
--- a/config.yaml
+++ b/config.yaml
@@ -0,0 +1,9 @@
+server:
+  host: "0.0.0.0"
+  port: 8000
+  ui: true
+
+whisper:
+  model_name: "turbo"
+  device: "cuda"
+  compute_type: "int8"
--- a/converter.py
+++ b/converter.py
@@ -0,0 +1,45 @@
+import ffmpeg
+import os
+import tempfile
+import shutil
+
+def is_valid_format(file_path: str) -> bool:
+    """Return True if the file's first audio stream is 16 kHz mono pcm_s16le.
+
+    A failed probe or a file without an audio stream yields False.
+    """
+    try:
+        streams = ffmpeg.probe(file_path).get('streams', [])
+    except ffmpeg.Error:
+        return False
+
+    # .get() so a stream entry lacking 'codec_type' cannot raise KeyError.
+    audio = next((s for s in streams if s.get('codec_type') == 'audio'), None)
+    # Values are matched exactly: sample_rate as the string '16000', channels as the int 1.
+    expected = {'codec_name': 'pcm_s16le', 'channels': 1, 'sample_rate': '16000'}
+    return audio is not None and all(audio.get(key) == value for key, value in expected.items())
+
+def convert_to_wav(input_file_path: str) -> tuple[str, bool]:
+    """Convert an audio file to 16 kHz mono pcm_s16le WAV.
+
+    Returns ``(path, converted)``: the input path and False when it already
+    passes ``is_valid_format``, otherwise the path of a new temporary
+    ``.wav`` file and True (the caller deletes it). If the conversion
+    fails, the temporary file is removed and the error is re-raised.
+    """
+    if is_valid_format(input_file_path):
+        return input_file_path, False
+
+    # mkstemp creates the file atomically; mktemp only returns a name that
+    # another process could claim before ffmpeg opens it.
+    fd, output_file_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    try:
+        stream = ffmpeg.input(input_file_path).output(output_file_path, acodec='pcm_s16le', ac=1, ar='16k')
+        stream.run(capture_stdout=True, capture_stderr=True, overwrite_output=True)  # target already exists
+    except Exception:
+        if os.path.exists(output_file_path):
+            os.remove(output_file_path)
+        raise
+    return output_file_path, True
+
--- a/main.py
+++ b/main.py
@@ -0,0 +1,198 @@
+import os
+import tempfile
+import sys
+import yaml
+from typing import Optional, List, Union, Tuple, Iterable
+from fastapi import FastAPI, UploadFile, File, Depends
+from pydantic import BaseModel
+from fastapi.responses import HTMLResponse
+from faster_whisper import WhisperModel
+from converter import convert_to_wav
+
+# Read as UTF-8 explicitly: the locale default could mangle non-ASCII values.
+with open("config.yaml", encoding="utf-8") as f:
+    config = yaml.safe_load(f)
+
+app = FastAPI()
+w_config = config['whisper']  # section with the model settings and transcription defaults
+
+class TranscriptionOptions(BaseModel):  # Options for one /transcribe request; forwarded by field name as keyword arguments to the model's transcribe().
+    language: Optional[str] = w_config.get('language')  # Every default comes from the 'whisper' config section when the key is present; None-valued fields are dropped before the call.
+    task: str = w_config.get('task', 'transcribe')
+    beam_size: int = w_config.get('beam_size', 5)
+    best_of: int = w_config.get('best_of', 5)
+    patience: float = w_config.get('patience', 1.0)
+    length_penalty: float = w_config.get('length_penalty', 1.0)
+    repetition_penalty: float = w_config.get('repetition_penalty', 1.0)
+    no_repeat_ngram_size: int = w_config.get('no_repeat_ngram_size', 0)
+    temperature: Union[float, List[float], Tuple[float, ...]] = w_config.get('temperature', [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])  # a single value or a sequence of values
+    log_progress: bool = w_config.get('log_progress', False)
+    compression_ratio_threshold: Optional[float] = w_config.get('compression_ratio_threshold', 2.4)
+    log_prob_threshold: Optional[float] = w_config.get('log_prob_threshold', -1.0)
+    no_speech_threshold: Optional[float] = w_config.get('no_speech_threshold', 0.6)
+    condition_on_previous_text: bool = w_config.get('condition_on_previous_text', True)
+    prompt_reset_on_temperature: float = w_config.get('prompt_reset_on_temperature', 0.5)
+    initial_prompt: Optional[Union[str, Iterable[int]]] = w_config.get('initial_prompt')  # text, or an iterable of ints
+    prefix: Optional[str] = w_config.get('prefix')
+    suppress_blank: bool = w_config.get('suppress_blank', True)
+    suppress_tokens: Optional[List[int]] = w_config.get('suppress_tokens', [-1])
+    without_timestamps: bool = w_config.get('without_timestamps', False)
+    max_initial_timestamp: float = w_config.get('max_initial_timestamp', 1.0)
+    word_timestamps: bool = w_config.get('word_timestamps', False)
+    prepend_punctuations: str = w_config.get('prepend_punctuations', '"\'“¿([{-')
+    append_punctuations: str = w_config.get('append_punctuations', '"\'.。,，!！?？:：”)]}、')
+    vad_filter: bool = w_config.get('vad_filter', False)
+    vad_parameters: Optional[dict] = w_config.get('vad_parameters')
+    max_new_tokens: Optional[int] = w_config.get('max_new_tokens')
+    chunk_length: Optional[int] = w_config.get('chunk_length')
+    clip_timestamps: Union[str, List[float]] = w_config.get('clip_timestamps', "0")  # a string or a list of floats
+    hallucination_silence_threshold: Optional[float] = w_config.get('hallucination_silence_threshold')
+    hotwords: Optional[str] = w_config.get('hotwords')
+    language_detection_threshold: Optional[float] = w_config.get('language_detection_threshold')
+    language_detection_segments: int = w_config.get('language_detection_segments', 1)
+
+class WhisperTranscriber:
+    """Holds one loaded WhisperModel; transcribe() returns the segments' text as a single string."""
+    def __init__(self, model_name, device, compute_type):
+        self.model = WhisperModel(model_name, device=device, compute_type=compute_type)
+
+    def transcribe(self, audio_file_path: str, options: dict) -> str:
+        segments, _ = self.model.transcribe(audio_file_path, **options)  # options become keyword arguments
+        return " ".join(text for seg in segments if (text := seg.text.strip()))  # stripped, non-empty: joins never double a space
+
+# One transcriber for the whole process: it is built at import time from the
+# 'whisper' config section and used by the /transcribe handler.
+transcriber = WhisperTranscriber(
+    **{key: w_config[key] for key in ('model_name', 'device', 'compute_type')}
+)
+
+@app.post("/transcribe")
+async def transcribe_audio(file: UploadFile = File(...), options: TranscriptionOptions = Depends()):
+    """Store the upload in a temp file, convert it to WAV if needed and return {"transcription": text}."""
+    from fastapi.concurrency import run_in_threadpool
+    temp_audio_file_path = converted_file_path = None
+    was_converted = False
+    try:
+        # The upload's filename may be missing; fall back to an empty suffix instead of raising.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename or "")[1]) as temp_audio_file:
+            temp_audio_file_path = temp_audio_file.name  # recorded first so a failed write is still cleaned up
+            temp_audio_file.write(await file.read())
+        # ffmpeg and the model call block; run them in the thread pool so the event loop stays responsive.
+        converted_file_path, was_converted = await run_in_threadpool(convert_to_wav, temp_audio_file_path)
+        kwargs = options.dict(exclude_none=True)  # None-valued options are not forwarded
+        return {"transcription": await run_in_threadpool(transcriber.transcribe, converted_file_path, kwargs)}
+    finally:
+        if temp_audio_file_path and os.path.exists(temp_audio_file_path):
+            os.remove(temp_audio_file_path)
+        if was_converted and converted_file_path and os.path.exists(converted_file_path):
+            os.remove(converted_file_path)
+
+def create_ui():  # Returns the whole upload page (markup, CSS and the script that POSTs to /transcribe) as one HTML string.
+    return '''
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>Whisper Transcription</title>
+        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
+        <style>
+            body {
+                background-color: #f8f9fa;
+            }
+            .container {
+                max-width: 700px;
+            }
+            #transcriptionOutput {
+                white-space: pre-wrap;
+                word-wrap: break-word;
+            }
+        </style>
+    </head>
+    <body>
+        <div class="container mt-5">
+            <div class="card">
+                <div class="card-body">
+                    <h1 class="card-title text-center mb-4">Upload Audio for Transcription</h1>
+                    <div class="mb-3">
+                        <input class="form-control" type="file" id="audioFile" accept="audio/*">
+                    </div>
+                    <div class="d-grid">
+                        <button class="btn btn-primary" onclick="transcribeAudio()">
+                            <span class="spinner-border spinner-border-sm d-none" role="status" aria-hidden="true" id="spinner"></span>
+                            Transcribe
+                        </button>
+                    </div>
+                    <h2 class="mt-4">Transcription:</h2>
+                    <div class="p-3 bg-light rounded">
+                        <pre id="transcriptionOutput"></pre>
+                    </div>
+                </div>
+            </div>
+        </div>
+
+        <script>
+            async function transcribeAudio() {
+                const fileInput = document.getElementById('audioFile');
+                const file = fileInput.files[0];
+                if (!file) {
+                    alert("Please select a file first.");
+                    return;
+                }
+
+                const formData = new FormData();
+                formData.append('file', file);
+
+                const outputElement = document.getElementById('transcriptionOutput');
+                const spinner = document.getElementById('spinner');
+                const transcribeButton = document.querySelector('button');
+
+                outputElement.innerText = '';
+                spinner.classList.remove('d-none');
+                transcribeButton.disabled = true;
+
+
+                try {
+                    const response = await fetch('/transcribe', {
+                        method: 'POST',
+                        body: formData
+                    });
+
+                    if (response.ok) {
+                        const result = await response.json();
+                        if (result.transcription) {
+                            outputElement.innerText = result.transcription;
+                        } else if (result.error) {
+                            outputElement.innerText = 'Error: ' + result.error;
+                        }
+                    } else {
+                        const errorText = await response.text();
+                        outputElement.innerText = 'Error: ' + response.statusText + ' - ' + errorText;
+                    }
+                } catch (error) {
+                    outputElement.innerText = 'An error occurred: ' + error;
+                } finally {
+                    spinner.classList.add('d-none');
+                    transcribeButton.disabled = false;
+                }
+            }
+        </script>
+    </body>
+    </html>
+    '''
+
+if __name__ == "__main__":
+    import uvicorn
+
+    s_config = config['server']
+    # The root page is registered only when enabled in the config or by --ui;
+    # .get() lets a config without a 'ui' key still honour the flag.
+    if s_config.get('ui') or "--ui" in sys.argv:
+        @app.get("/", response_class=HTMLResponse)
+        async def read_root():
+            """Serve the upload page built by create_ui()."""
+            return create_ui()
+
+    uvicorn.run(
+        app, host=s_config['host'], port=s_config['port'],
+    )
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+fastapi
+uvicorn[standard]
+python-multipart
+faster-whisper
+ffmpeg-python
+PyYAML