commit 303b7b758424f39f3c5ca423bf2a52abb6844846
Author: vladislav
Date:   Tue Jul 8 22:14:32 2025 +0300

    micro server with small ui on 8000 for running faster_whisper

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..b58b603
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,5 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..406175b
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,9 @@
+server:
+  host: "0.0.0.0"
+  port: 8000
+  ui: true
+
+whisper:
+  model_name: "turbo"
+  device: "cuda"
+  compute_type: "int8"
\ No newline at end of file
diff --git a/converter.py b/converter.py
new file mode 100644
index 0000000..7d05819
--- /dev/null
+++ b/converter.py
@@ -0,0 +1,63 @@
+import ffmpeg
+import os
+import tempfile
+import shutil
+
+def is_valid_format(file_path: str) -> bool:
+    """Check whether the first audio stream is 16 kHz mono 16-bit PCM.
+
+    Returns False when the file has no audio stream or when ffmpeg.probe
+    raises ffmpeg.Error.
+    """
+    try:
+        probe = ffmpeg.probe(file_path)
+        audio_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'audio'), None)
+        if audio_stream is None:
+            return False
+
+        return (
+            audio_stream.get('codec_name') == 'pcm_s16le' and
+            audio_stream.get('channels') == 1 and
+            audio_stream.get('sample_rate') == '16000'
+        )
+    except ffmpeg.Error:
+        return False
+
+def convert_to_wav(input_file_path: str) -> tuple[str, bool]:
+    """Convert an audio file to 16 kHz mono 16-bit PCM WAV.
+
+    Returns a (path, converted) tuple. When the input already has the target
+    format, the original path is returned with converted=False. Otherwise a
+    new temporary .wav file is written and returned with converted=True; the
+    caller is responsible for deleting it.
+
+    Any exception raised while running ffmpeg is re-raised after the partial
+    output file has been removed.
+    """
+    if is_valid_format(input_file_path):
+        return input_file_path, False
+
+    # mkstemp creates the file atomically; tempfile.mktemp only picks a name,
+    # which is deprecated and open to a race with other processes.
+    fd, output_file_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+
+    try:
+        ffmpeg.input(input_file_path).output(
+            output_file_path,
+            acodec='pcm_s16le',
+            ac=1,
+            ar='16k'
+        ).run(
+            capture_stdout=True,
+            capture_stderr=True,
+            # The placeholder file already exists, so ffmpeg must overwrite it.
+            overwrite_output=True,
+        )
+        return output_file_path, True
+    except Exception:
+        # Do not leave the temporary file behind on failure.
+        if os.path.exists(output_file_path):
+            os.remove(output_file_path)
+        raise
+
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..d6512c9
--- /dev/null
+++ b/main.py
@@ -0,0 +1,221 @@
+import os
+import tempfile
+import sys
+import yaml
+from typing import Optional, List, Union, Tuple, Iterable
+from fastapi import FastAPI, UploadFile, File, Depends
+from pydantic import BaseModel
+from fastapi.responses import HTMLResponse
+from faster_whisper import WhisperModel
+from converter import convert_to_wav
+
+with open("config.yaml", 'r', encoding="utf-8") as f:
+    config = yaml.safe_load(f)
+
+app = FastAPI()
+
+w_config = config['whisper']
+
+class TranscriptionOptions(BaseModel):
+    """Per-request transcription options.
+
+    Every default is read from the "whisper" section of config.yaml and falls
+    back to the literal given here when the key is absent. Fields that are
+    not None are passed to WhisperModel.transcribe as keyword arguments.
+    """
+    language: Optional[str] = w_config.get('language')
+    task: str = w_config.get('task', 'transcribe')
+    beam_size: int = w_config.get('beam_size', 5)
+    best_of: int = w_config.get('best_of', 5)
+    patience: float = w_config.get('patience', 1.0)
+    length_penalty: float = w_config.get('length_penalty', 1.0)
+    repetition_penalty: float = w_config.get('repetition_penalty', 1.0)
+    no_repeat_ngram_size: int = w_config.get('no_repeat_ngram_size', 0)
+    temperature: Union[float, List[float], Tuple[float, ...]] = w_config.get('temperature', [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
+    log_progress: bool = w_config.get('log_progress', False)
+    compression_ratio_threshold: Optional[float] = w_config.get('compression_ratio_threshold', 2.4)
+    log_prob_threshold: Optional[float] = w_config.get('log_prob_threshold', -1.0)
+    no_speech_threshold: Optional[float] = w_config.get('no_speech_threshold', 0.6)
+    condition_on_previous_text: bool = w_config.get('condition_on_previous_text', True)
+    prompt_reset_on_temperature: float = w_config.get('prompt_reset_on_temperature', 0.5)
+    initial_prompt: Optional[Union[str, Iterable[int]]] = w_config.get('initial_prompt')
+    prefix: Optional[str] = w_config.get('prefix')
+    suppress_blank: bool = w_config.get('suppress_blank', True)
+    suppress_tokens: Optional[List[int]] = w_config.get('suppress_tokens', [-1])
+    without_timestamps: bool = w_config.get('without_timestamps', False)
+    max_initial_timestamp: float = w_config.get('max_initial_timestamp', 1.0)
+    word_timestamps: bool = w_config.get('word_timestamps', False)
+    prepend_punctuations: str = w_config.get('prepend_punctuations', '"\'“¿([{-')
+    append_punctuations: str = w_config.get('append_punctuations', '"\'.。,,!!??::”)]}、')
+    vad_filter: bool = w_config.get('vad_filter', False)
+    vad_parameters: Optional[dict] = w_config.get('vad_parameters')
+    max_new_tokens: Optional[int] = w_config.get('max_new_tokens')
+    chunk_length: Optional[int] = w_config.get('chunk_length')
+    clip_timestamps: Union[str, List[float]] = w_config.get('clip_timestamps', "0")
+    hallucination_silence_threshold: Optional[float] = w_config.get('hallucination_silence_threshold')
+    hotwords: Optional[str] = w_config.get('hotwords')
+    language_detection_threshold: Optional[float] = w_config.get('language_detection_threshold')
+    language_detection_segments: int = w_config.get('language_detection_segments', 1)
+
+class WhisperTranscriber:
+    """Thin wrapper around a faster_whisper WhisperModel."""
+
+    def __init__(self, model_name, device, compute_type):
+        """Load the model with the given name, device and compute type."""
+        self.model = WhisperModel(model_name, device=device, compute_type=compute_type)
+
+    def transcribe(self, audio_file_path: str, options: dict) -> str:
+        """Transcribe the file and return all segment texts joined by spaces.
+
+        options is forwarded to WhisperModel.transcribe as keyword arguments.
+        """
+        segments, _ = self.model.transcribe(audio_file_path, **options)
+        return " ".join(segment.text for segment in segments)
+
+transcriber = WhisperTranscriber(
+    model_name=w_config['model_name'],
+    device=w_config['device'],
+    compute_type=w_config['compute_type']
+)
+
+@app.post("/transcribe")
+async def transcribe_audio(file: UploadFile = File(...), options: TranscriptionOptions = Depends()):
+    """Transcribe an uploaded audio file and return {"transcription": text}.
+
+    The upload is saved to a temporary file, converted to 16 kHz mono WAV
+    when needed, and transcribed. Temporary files are removed afterwards,
+    whether or not transcription succeeded.
+    """
+    temp_audio_file_path = None
+    converted_file_path = None
+    was_converted = False
+    # UploadFile.filename may be None; fall back to no suffix instead of
+    # letting os.path.splitext raise TypeError.
+    suffix = os.path.splitext(file.filename or "")[1]
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio_file:
+            # Record the path before writing so the finally block can remove
+            # the file even if reading the upload fails.
+            temp_audio_file_path = temp_audio_file.name
+            temp_audio_file.write(await file.read())
+
+        converted_file_path, was_converted = convert_to_wav(temp_audio_file_path)
+
+        transcription = transcriber.transcribe(converted_file_path, options.dict(exclude_none=True))
+
+        return {"transcription": transcription}
+    finally:
+        if temp_audio_file_path and os.path.exists(temp_audio_file_path):
+            os.remove(temp_audio_file_path)
+        if was_converted and converted_file_path and os.path.exists(converted_file_path):
+            os.remove(converted_file_path)
+
+def create_ui():
+    return '''
+
+
+
+
+
+ Whisper Transcription
+
+
+
+
+
+
+

Upload Audio for Transcription

+
+ +
+
+ +
+

Transcription:

+
+

+                    
+
+
+
+
+
+
+
+    '''
+
+if __name__ == "__main__":  # script entry point: serve the app with uvicorn
+    import uvicorn
+
+    s_config = config['server']
+
+    if s_config.get('ui') or "--ui" in sys.argv:  # "ui" key is optional; --ui on the command line also enables the page
+        @app.get("/", response_class=HTMLResponse)  # route is registered only when the UI is enabled
+        async def read_root():
+            return create_ui()
+
+    uvicorn.run(
+        app,
+        host=s_config['host'],
+        port=s_config['port']
+    )
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..27a1a64
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+fastapi
+uvicorn[standard]
+python-multipart
+faster-whisper
+ffmpeg-python
+PyYAML