This commit is contained in:
2025-07-11 19:17:33 +03:00
parent 303b7b7584
commit 7bb3cc408f
5 changed files with 29 additions and 253 deletions

28
Dockerfile Normal file
View File

@@ -0,0 +1,28 @@
# Use the ROCm base image that ships with PyTorch preinstalled
FROM rocm/pytorch:rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0
# Set the working directory inside the container
WORKDIR /app
# Install system dependencies (ffmpeg for audio handling, pip for Python packages);
# the apt package lists are removed in the same layer to keep the image smaller
RUN apt-get update && apt-get install -y \
    ffmpeg \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*
# Install Python dependencies.
# requirements.txt is copied on its own before the rest of the sources so this
# layer can be reused from cache when only application code changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image;
# --default-timeout=100 raises pip's network timeout to 100 seconds
RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt
# Copy the remaining application files
COPY . .
# Expose the port the application listens on (must match --port in CMD below)
EXPOSE 9854
# Set environment variables for ROCm.
# NOTE(review): both values pin the GPU target to gfx1030 (10.3.0); presumably
# chosen for the deployment GPU — confirm before running on other hardware.
ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
ENV PYTORCH_ROCM_ARCH=gfx1030
# Command that starts the application: serve the `app` object from module `app`
# on all interfaces, port 9854, with debug-level logging
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "9854", "--log-level", "debug"]

View File

@@ -1,9 +0,0 @@
# Settings for the HTTP server started when main.py is run as a script.
server:
  # Interface and port passed to uvicorn.run().
  host: "0.0.0.0"
  port: 8000
  # When true, the HTML upload page is registered at "/".
  ui: true
# Settings for the Whisper model. model_name, device and compute_type are
# required; any other key placed here overrides the matching default of the
# transcription options.
whisper:
  model_name: "turbo"
  device: "cuda"
  compute_type: "int8"

View File

@@ -1,45 +0,0 @@
import ffmpeg
import os
import tempfile
import shutil
def is_valid_format(file_path: str) -> bool:
    """Report whether the file's first audio stream is 16 kHz mono 16-bit PCM.

    The file is inspected with ``ffmpeg.probe``. Only the first stream whose
    ``codec_type`` is ``'audio'`` is examined; it must have codec
    ``pcm_s16le``, exactly one channel and a sample rate of ``'16000'``
    (compared as a string). The container format itself is not checked.

    Args:
        file_path: Path of the media file to inspect.

    Returns:
        True when the first audio stream matches all three properties;
        False when it does not, when the file has no audio stream, or when
        probing fails with ``ffmpeg.Error``.
    """
    try:
        probe_result = ffmpeg.probe(file_path)
    except ffmpeg.Error:
        return False

    required = (
        ('codec_name', 'pcm_s16le'),
        ('channels', 1),
        ('sample_rate', '16000'),
    )
    for stream in probe_result['streams']:
        if stream['codec_type'] != 'audio':
            continue
        # The verdict is based solely on the first audio stream found.
        return all(stream.get(field) == wanted for field, wanted in required)
    return False
def convert_to_wav(input_file_path: str) -> tuple[str, bool]:
    """Convert an audio file to 16 kHz mono 16-bit PCM WAV if necessary.

    Args:
        input_file_path: Path of the audio file to convert.

    Returns:
        A ``(path, converted)`` tuple. When the input already passes
        ``is_valid_format`` the original path is returned with ``False``.
        Otherwise the path of a newly created temporary ``.wav`` file is
        returned with ``True``; the caller is responsible for deleting it.

    Raises:
        ffmpeg.Error: If ffmpeg fails to convert the file. The temporary
            output file is removed before the error propagates.
    """
    if is_valid_format(input_file_path):
        return input_file_path, False

    # mkstemp creates the file atomically with a unique name, unlike the
    # deprecated mktemp, which only returns a name another process could
    # claim first. Only the path is needed, so the descriptor is closed.
    fd, output_file_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        ffmpeg.input(input_file_path).output(
            output_file_path,
            acodec='pcm_s16le',
            ac=1,
            ar='16k',
        ).run(
            capture_stdout=True,
            capture_stderr=True,
            # The placeholder file already exists, so ffmpeg must be told
            # to overwrite it instead of refusing.
            overwrite_output=True,
        )
    except BaseException:
        # Do not leave a partial or empty output behind on any failure.
        if os.path.exists(output_file_path):
            os.remove(output_file_path)
        raise
    return output_file_path, True

198
main.py
View File

@@ -1,198 +0,0 @@
import os
import tempfile
import sys
import yaml
from typing import Optional, List, Union, Tuple, Iterable
from fastapi import FastAPI, UploadFile, File, Depends
from pydantic import BaseModel
from fastapi.responses import HTMLResponse
from faster_whisper import WhisperModel
from converter import convert_to_wav
# Load the application settings once, at import time. The path is relative
# to the current working directory. The encoding is given explicitly so that
# non-ASCII values in the file are read the same way regardless of the
# platform's locale default.
with open("config.yaml", 'r', encoding="utf-8") as f:
    config = yaml.safe_load(f)

# The ASGI application object that routes are registered on.
app = FastAPI()

# The "whisper" section holds the model settings and optional overrides for
# the transcription option defaults.
w_config = config['whisper']
class TranscriptionOptions(BaseModel):
    """Per-request options forwarded to the Whisper model's ``transcribe`` call.

    Every field default is taken from the ``whisper`` section of the loaded
    config (``w_config``); when the key is absent the literal fallback given
    in the ``.get(...)`` call is used, or ``None`` where no fallback is given.
    The defaults are evaluated once, when this class is defined.

    The request handler dumps an instance with ``exclude_none=True`` and
    passes the result as keyword arguments to the model, so the field names
    must match the keyword parameters accepted there.
    """

    # --- language / task selection ---
    language: Optional[str] = w_config.get('language')
    task: str = w_config.get('task', 'transcribe')
    # --- decoding / search parameters ---
    beam_size: int = w_config.get('beam_size', 5)
    best_of: int = w_config.get('best_of', 5)
    patience: float = w_config.get('patience', 1.0)
    length_penalty: float = w_config.get('length_penalty', 1.0)
    repetition_penalty: float = w_config.get('repetition_penalty', 1.0)
    no_repeat_ngram_size: int = w_config.get('no_repeat_ngram_size', 0)
    # Either a single temperature or a sequence of temperatures.
    temperature: Union[float, List[float], Tuple[float, ...]] = w_config.get('temperature', [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    log_progress: bool = w_config.get('log_progress', False)
    # --- thresholds ---
    compression_ratio_threshold: Optional[float] = w_config.get('compression_ratio_threshold', 2.4)
    log_prob_threshold: Optional[float] = w_config.get('log_prob_threshold', -1.0)
    no_speech_threshold: Optional[float] = w_config.get('no_speech_threshold', 0.6)
    # --- prompting / context ---
    condition_on_previous_text: bool = w_config.get('condition_on_previous_text', True)
    prompt_reset_on_temperature: float = w_config.get('prompt_reset_on_temperature', 0.5)
    initial_prompt: Optional[Union[str, Iterable[int]]] = w_config.get('initial_prompt')
    prefix: Optional[str] = w_config.get('prefix')
    # --- token suppression ---
    suppress_blank: bool = w_config.get('suppress_blank', True)
    suppress_tokens: Optional[List[int]] = w_config.get('suppress_tokens', [-1])
    # --- timestamps and punctuation ---
    without_timestamps: bool = w_config.get('without_timestamps', False)
    max_initial_timestamp: float = w_config.get('max_initial_timestamp', 1.0)
    word_timestamps: bool = w_config.get('word_timestamps', False)
    prepend_punctuations: str = w_config.get('prepend_punctuations', '"\'“¿([{-')
    append_punctuations: str = w_config.get('append_punctuations', '"\'.。,!?::”)]}、')
    # --- voice activity detection ---
    vad_filter: bool = w_config.get('vad_filter', False)
    vad_parameters: Optional[dict] = w_config.get('vad_parameters')
    # --- length / chunking limits ---
    max_new_tokens: Optional[int] = w_config.get('max_new_tokens')
    chunk_length: Optional[int] = w_config.get('chunk_length')
    clip_timestamps: Union[str, List[float]] = w_config.get('clip_timestamps', "0")
    hallucination_silence_threshold: Optional[float] = w_config.get('hallucination_silence_threshold')
    hotwords: Optional[str] = w_config.get('hotwords')
    # --- language detection ---
    language_detection_threshold: Optional[float] = w_config.get('language_detection_threshold')
    language_detection_segments: int = w_config.get('language_detection_segments', 1)
class WhisperTranscriber:
    """Small wrapper that owns one ``WhisperModel`` and returns plain text."""

    def __init__(self, model_name, device, compute_type):
        """Create the underlying model.

        Args:
            model_name: Passed to ``WhisperModel`` as its first argument.
            device: Passed to ``WhisperModel`` as ``device``.
            compute_type: Passed to ``WhisperModel`` as ``compute_type``.
        """
        self.model = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
        )

    def transcribe(self, audio_file_path: str, options: dict) -> str:
        """Transcribe a file and return the segment texts joined by spaces.

        Args:
            audio_file_path: Path of the audio file handed to the model.
            options: Keyword arguments forwarded to the model's
                ``transcribe`` method.

        Returns:
            The ``text`` of every returned segment, joined with a single
            space; an empty string when there are no segments.
        """
        # The second element of the model's result is not used here.
        segment_iter, _unused = self.model.transcribe(audio_file_path, **options)
        return " ".join(piece.text for piece in segment_iter)
# Single shared transcriber, built at import time from the three required
# keys of the "whisper" config section (a missing key raises KeyError here).
transcriber = WhisperTranscriber(
    w_config['model_name'],
    w_config['device'],
    w_config['compute_type'],
)
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...), options: TranscriptionOptions = Depends()):
    """Transcribe an uploaded audio file.

    The upload is written to a temporary file, converted to 16 kHz mono WAV
    when needed, and passed to the shared transcriber. Both temporary files
    are removed before the handler returns, whether or not it succeeded.

    Args:
        file: The uploaded audio file (multipart form field ``file``).
        options: Transcription options; fields that are ``None`` are not
            forwarded to the model.

    Returns:
        A dict of the form ``{"transcription": <text>}``.

    NOTE(review): ``convert_to_wav`` and ``transcriber.transcribe`` are called
    synchronously inside this coroutine, so the event loop is blocked while
    they run. Confirm whether requests are meant to be serialized this way
    before moving the work to a thread pool.
    """
    temp_audio_file_path = None
    converted_file_path = None
    was_converted = False
    # The client may omit the filename, in which case it is None; fall back
    # to an empty suffix instead of letting splitext raise TypeError.
    suffix = os.path.splitext(file.filename or "")[1]
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio_file:
            # Record the path before reading/writing so the cleanup below
            # still removes the file if either of those steps fails.
            temp_audio_file_path = temp_audio_file.name
            temp_audio_file.write(await file.read())
        converted_file_path, was_converted = convert_to_wav(temp_audio_file_path)
        transcription = transcriber.transcribe(converted_file_path, options.dict(exclude_none=True))
        return {"transcription": transcription}
    finally:
        if temp_audio_file_path and os.path.exists(temp_audio_file_path):
            os.remove(temp_audio_file_path)
        # When no conversion happened, converted_file_path is the upload's
        # own temp file, which was already removed above.
        if was_converted and converted_file_path and os.path.exists(converted_file_path):
            os.remove(converted_file_path)
def create_ui():
    """Return the HTML for the single-page upload UI.

    The page is static: a file input, a "Transcribe" button and an output
    area. Its script posts the selected file as form field ``file`` to
    ``/transcribe`` and shows either the returned ``transcription`` or an
    error message.
    """
    upload_page = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Whisper Transcription</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<style>
body {
background-color: #f8f9fa;
}
.container {
max-width: 700px;
}
#transcriptionOutput {
white-space: pre-wrap;
word-wrap: break-word;
}
</style>
</head>
<body>
<div class="container mt-5">
<div class="card">
<div class="card-body">
<h1 class="card-title text-center mb-4">Upload Audio for Transcription</h1>
<div class="mb-3">
<input class="form-control" type="file" id="audioFile" accept="audio/*">
</div>
<div class="d-grid">
<button class="btn btn-primary" onclick="transcribeAudio()">
<span class="spinner-border spinner-border-sm d-none" role="status" aria-hidden="true" id="spinner"></span>
Transcribe
</button>
</div>
<h2 class="mt-4">Transcription:</h2>
<div class="p-3 bg-light rounded">
<pre id="transcriptionOutput"></pre>
</div>
</div>
</div>
</div>
<script>
async function transcribeAudio() {
const fileInput = document.getElementById('audioFile');
const file = fileInput.files[0];
if (!file) {
alert("Please select a file first.");
return;
}
const formData = new FormData();
formData.append('file', file);
const outputElement = document.getElementById('transcriptionOutput');
const spinner = document.getElementById('spinner');
const transcribeButton = document.querySelector('button');
outputElement.innerText = '';
spinner.classList.remove('d-none');
transcribeButton.disabled = true;
try {
const response = await fetch('/transcribe', {
method: 'POST',
body: formData
});
if (response.ok) {
const result = await response.json();
if (result.transcription) {
outputElement.innerText = result.transcription;
} else if (result.error) {
outputElement.innerText = 'Error: ' + result.error;
}
} else {
const errorText = await response.text();
outputElement.innerText = 'Error: ' + response.statusText + ' - ' + errorText;
}
} catch (error) {
outputElement.innerText = 'An error occurred: ' + error;
} finally {
spinner.classList.add('d-none');
transcribeButton.disabled = false;
}
}
</script>
</body>
</html>
'''
    return upload_page
if __name__ == "__main__":
    # Script entry point: optionally register the HTML page, then start the
    # server with the host and port from the "server" config section.
    import uvicorn

    s_config = config['server']

    # The UI is served when the config enables it or "--ui" is on the
    # command line; the config value is looked up first.
    serve_ui = s_config['ui'] or "--ui" in sys.argv
    if serve_ui:
        async def read_root():
            return create_ui()

        # Equivalent to decorating read_root with @app.get(...).
        read_root = app.get("/", response_class=HTMLResponse)(read_root)

    uvicorn.run(app, host=s_config['host'], port=s_config['port'])

View File

@@ -1,6 +1,6 @@
fastapi
uvicorn[standard]
python-multipart
faster-whisper
openai-whisper
ffmpeg-python
PyYAML