created

2025-07-11 19:17:33 +03:00
parent 303b7b7584
commit e5fd44e3c3
6 changed files with 221 additions and 253 deletions
--- a/28
+++ b/28
@@ -0,0 +1,28 @@
 # Use the ROCm base image that ships with PyTorch preinstalled
 FROM rocm/pytorch:rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0
 # Set the working directory inside the container
 WORKDIR /app
 # Install system dependencies (the application shells out to ffmpeg/ffprobe)
 RUN apt-get update && apt-get install -y \
    ffmpeg \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir --default-timeout=100 -r requirements.txt
 # Copy the rest of the application files
 COPY . .
 # Expose the port the application listens on
 EXPOSE 9854
 # Set environment variables for ROCm
 # NOTE(review): presumably pins the GPU target to gfx1030 - confirm against the deployment hardware
 ENV HSA_OVERRIDE_GFX_VERSION=10.3.0
 ENV PYTORCH_ROCM_ARCH=gfx1030
 # Command that starts the application
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "9854", "--log-level", "debug"]
--- a/app.py
+++ b/app.py
@@ -0,0 +1,192 @@
 import logging
 import os
 import subprocess
 import time
 from typing import Dict
 from typing import Optional, Union, List, Tuple
 import whisper
 from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
 from fastapi.security import APIKeyHeader
 # Configure logging: INFO and above, each record prefixed with timestamp,
 # logger name and level
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 # Module-level logger shared by the functions in this file
 logger = logging.getLogger(__name__)
 # FastAPI application object; routes are registered on it with decorators
 app = FastAPI()
 # API key header: clients send their key in the "x-api-key" request header
 api_key_header = APIKeyHeader(name="x-api-key")
 def get_keys():
    """
    Return the list of accepted API keys stored in ``keys.txt``.

    If the file does not exist, it is created with a single randomly
    generated key and that key is returned as a one-element list.

    Returns:
        A list with one stripped key per non-blank line of the file.

    Raises:
        ValueError: If ``keys.txt`` exists but holds no non-blank lines.
    """
    keys_file = "keys.txt"
    try:
        # EAFP: open directly instead of checking os.path.exists() first,
        # which avoids the window between the check and the open.
        with open(keys_file, "r") as f:
            keys = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        # First start: bootstrap the file with one random 32-byte key.
        default_key = os.urandom(32).hex()
        with open(keys_file, "w") as f:
            f.write(default_key + "\n")
        # The freshly generated key is logged once so the operator can pick it up.
        logger.info(f"Created new keys file with default key: {default_key}")
        return [default_key]
    logger.info(f"Loaded {len(keys)} keys from file")
    # Key values are deliberately not written to the log: dumping every
    # accepted secret there would leak them to anyone who can read the logs.
    if not keys:
        raise ValueError("No keys found in keys.txt")
    return keys
 def convert_audio(input_path: str, output_path: str, speed: float = 1.25):
    """
    Convert audio to 16 kHz mono 16-bit PCM and change its tempo with FFmpeg.

    Args:
        input_path: Path of the source audio file.
        output_path: Path the converted audio is written to; an existing
            file is overwritten.
        speed: Tempo factor handed to FFmpeg's ``atempo`` filter.

    Returns:
        True when FFmpeg exits successfully, False when it fails or cannot
        be started at all.
    """
    command = [
        'ffmpeg',
        '-y',  # global option, placed before the files rather than trailing the output
        '-i', input_path,
        '-filter:a', f'atempo={speed}',
        '-ar', '16000',
        '-ac', '1',
        '-c:a', 'pcm_s16le',
        output_path,
    ]
    logger.debug(f"Running FFmpeg command: {' '.join(command)}")
    try:
        # stdin is detached so FFmpeg never tries to read the server's stdin.
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            stdin=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError as e:
        # FFmpeg's stderr is not guaranteed to be valid UTF-8; never let the
        # error handler itself fail while decoding it.
        logger.error(f"FFmpeg conversion failed: {e.stderr.decode(errors='replace')}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable is missing or cannot be executed.
        logger.error(f"FFmpeg conversion failed: {e}")
        return False
    return True
 class TranscriptionMetrics:
    """Wall-clock timing and throughput figures for one transcription run."""

    def __init__(self):
        # Timing starts the moment the object is created.
        self.start_time = time.time()
        self.end_time = None  # filled in by stop()
        self.text_length = 0
        self.audio_duration = 0

    def stop(self, text: str, audio_duration: float):
        """Record the end time, the transcript length and the audio duration."""
        self.end_time = time.time()
        self.text_length = len(text)
        self.audio_duration = audio_duration

    def get_metrics(self) -> Dict[str, float]:
        """
        Return the collected figures as a dict of rounded numbers.

        If ``stop()`` has not been called yet, the elapsed time is measured
        up to now. When no time has elapsed, both rate values are reported as
        0.0 instead of raising ``ZeroDivisionError``.
        """
        end_time = time.time() if self.end_time is None else self.end_time
        processing_time = end_time - self.start_time
        if processing_time > 0:
            characters_per_second = self.text_length / processing_time
            audio_realtime_ratio = self.audio_duration / processing_time
        else:
            # Coarse clocks can report identical start and end timestamps.
            characters_per_second = 0.0
            audio_realtime_ratio = 0.0
        return {
            "processing_time_seconds": round(processing_time, 2),
            "characters_per_second": round(characters_per_second, 2),
            "audio_realtime_ratio": round(audio_realtime_ratio, 2),
            "audio_duration": round(self.audio_duration, 2),
            "text_length": self.text_length
        }
 def get_audio_duration(file_path: str) -> float:
    """
    Get audio duration using ffprobe.

    Args:
        file_path: Path of the media file to inspect.

    Returns:
        The duration printed by ffprobe's ``format=duration`` entry as a
        float, or 0.0 when ffprobe is unavailable, fails, or prints something
        that is not a number.
    """
    cmd = [
        'ffprobe',
        '-v', 'quiet',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        file_path
    ]
    try:
        output = subprocess.check_output(cmd).decode().strip()
        return float(output)
    except (subprocess.CalledProcessError, OSError, ValueError):
        # Best effort: a missing binary (OSError), a non-zero exit status or
        # unparsable output (ValueError) all mean "duration unknown". Unlike
        # a bare except, KeyboardInterrupt/SystemExit still propagate.
        return 0.0
@app.post("/transcribe")
 async def transcribe_audio(
    file: UploadFile = File(...),
    token: str = Depends(api_key_header),
    model_name: str = "medium",
    verbose: Optional[bool] = None,
    temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    compression_ratio_threshold: Optional[float] = 2.4,
    logprob_threshold: Optional[float] = -1.0,
    no_speech_threshold: Optional[float] = 0.6,
    condition_on_previous_text: bool = True,
    initial_prompt: Optional[str] = None,
    word_timestamps: bool = False,
    prepend_punctuations: str = "\"'\"¿([{-",
    append_punctuations: str = "\"\'.。,，!！?？:：\")]}、",
    clip_timestamps: Union[str, List[float]] = "0",
    hallucination_silence_threshold: Optional[float] = None
 ):
    # Token validation
    if token not in get_keys():
        logger.warning(f"Invalid token attempt: {token}")
        raise HTTPException(status_code=403, detail="Forbidden")
    logger.info(f"Processing file: {file.filename} with model: {model_name}")
    metrics = TranscriptionMetrics()
    # Save uploaded file
    temp_input_path = f"/tmp/input_{file.filename}"
    temp_output_path = f"/tmp/converted_{file.filename}.wav"
    try:
        with open(temp_input_path, "wb") as f:
            f.write(await file.read())
        # Convert audio if needed
        logger.debug("Converting audio file")
        if not convert_audio(temp_input_path, temp_output_path):
            raise HTTPException(status_code=400, detail="Audio conversion failed")
        # Get audio duration before speed up
        original_duration = get_audio_duration(temp_input_path)
        # Load model
        logger.debug(f"Loading model: {model_name}")
        model = whisper.load_model(model_name, device="cuda")
        # Transcribe
        logger.info("Starting transcription")
        result = model.transcribe(
            temp_output_path,
            verbose=verbose,
            temperature=temperature,
            compression_ratio_threshold=compression_ratio_threshold,
            logprob_threshold=logprob_threshold,
            no_speech_threshold=no_speech_threshold,
            condition_on_previous_text=condition_on_previous_text,
            initial_prompt=initial_prompt,
            word_timestamps=word_timestamps,
            prepend_punctuations=prepend_punctuations,
            append_punctuations=append_punctuations,
            clip_timestamps=clip_timestamps,
            hallucination_silence_threshold=hallucination_silence_threshold
        )
        # Calculate metrics
        metrics.stop(result["text"], original_duration)
        logger.info(f"Transcription metrics: {metrics.get_metrics()}")
        # Add metrics to result
        result["metrics"] = metrics.get_metrics()
        return result
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Cleanup temporary files
        if os.path.exists(temp_input_path):
            os.remove(temp_input_path)
        if os.path.exists(temp_output_path):
            os.remove(temp_output_path)
 def main():
    """Start the API with uvicorn when the module is run as a script."""
    # Imported here so uvicorn is only needed when the script entry point is used.
    import uvicorn
    # Called for its side effects: creates keys.txt with a generated key when
    # the file is missing, and raises before serving if the file holds no keys.
    get_keys()
    # Listen on all interfaces; no port is passed, so uvicorn's default applies.
    uvicorn.run(app, host="0.0.0.0")
 if __name__ == "__main__":
    main()
--- a/config.yaml
+++ b/config.yaml
@@ -1,9 +0,0 @@
 # Settings loaded by main.py with yaml.safe_load when the module is imported.
 server:
  # host and port are passed to uvicorn.run()
  host: "0.0.0.0"
  port: 8000
  # when true, the HTML upload page is registered at "/"
  ui: true
 # Arguments used to construct the faster-whisper WhisperModel
 whisper:
  model_name: "turbo"
  device: "cuda"
  compute_type: "int8"
--- a/converter.py
+++ b/converter.py
@@ -1,45 +0,0 @@
 import ffmpeg
 import os
 import tempfile
 import shutil
 def is_valid_format(file_path: str) -> bool:
    """Check whether the audio file is 16kHz mono WAV (pcm_s16le)."""
    try:
        streams = ffmpeg.probe(file_path)['streams']
    except ffmpeg.Error:
        # Probing failed, so the file cannot be confirmed as valid.
        return False
    # Only the first audio stream is inspected.
    for candidate in streams:
        if candidate['codec_type'] != 'audio':
            continue
        if candidate.get('codec_name') != 'pcm_s16le':
            return False
        if candidate.get('channels') != 1:
            return False
        # The probe result carries the sample rate as a string.
        return candidate.get('sample_rate') == '16000'
    # No audio stream at all.
    return False
 def convert_to_wav(input_file_path: str) -> tuple[str, bool]:
    """
    Convert an audio file to 16kHz mono WAV.

    Returns the path of the converted file and a flag telling whether a
    conversion was performed. If the file already has the required format,
    the original path and False are returned.

    Raises:
        ffmpeg.Error: If the conversion fails; the output file is removed
            before the error propagates.
    """
    if is_valid_format(input_file_path):
        return input_file_path, False
    # mkstemp creates the file atomically. tempfile.mktemp is deprecated: it
    # only returns a name that another process could claim first.
    fd, output_file_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        ffmpeg.input(input_file_path).output(
            output_file_path,
            acodec='pcm_s16le',
            ac=1,
            ar='16k'
        ).run(
            capture_stdout=True,
            capture_stderr=True,
            # The placeholder created by mkstemp already exists on disk.
            overwrite_output=True,
        )
    except Exception:
        # The placeholder now always exists, so remove it on any failure,
        # not only on ffmpeg.Error, then re-raise with the original traceback.
        if os.path.exists(output_file_path):
            os.remove(output_file_path)
        raise
    return output_file_path, True
--- a/main.py
+++ b/main.py
@@ -1,198 +0,0 @@
 import os
 import tempfile
 import sys
 import yaml
 from typing import Optional, List, Union, Tuple, Iterable
 from fastapi import FastAPI, UploadFile, File, Depends
 from pydantic import BaseModel
 from fastapi.responses import HTMLResponse
 from faster_whisper import WhisperModel
 from converter import convert_to_wav
 # Load the configuration once at import time; the path is relative to the
 # current working directory.
 with open("config.yaml", 'r') as f:
    config = yaml.safe_load(f)
 # FastAPI application object; routes are registered on it with decorators
 app = FastAPI()
 # Shortcut to the "whisper" section, used for option defaults and model setup
 w_config = config['whisper']
 class TranscriptionOptions(BaseModel):
    # Per-request transcription options.
    # Each default is read from the "whisper" section of config.yaml when the
    # key is present there; otherwise the literal fallback given here is used.
    # The endpoint receives this model through Depends() and forwards its
    # non-None fields as keyword arguments to the model's transcribe() call.
    language: Optional[str] = w_config.get('language')
    task: str = w_config.get('task', 'transcribe')
    beam_size: int = w_config.get('beam_size', 5)
    best_of: int = w_config.get('best_of', 5)
    patience: float = w_config.get('patience', 1.0)
    length_penalty: float = w_config.get('length_penalty', 1.0)
    repetition_penalty: float = w_config.get('repetition_penalty', 1.0)
    no_repeat_ngram_size: int = w_config.get('no_repeat_ngram_size', 0)
    temperature: Union[float, List[float], Tuple[float, ...]] = w_config.get('temperature', [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    log_progress: bool = w_config.get('log_progress', False)
    compression_ratio_threshold: Optional[float] = w_config.get('compression_ratio_threshold', 2.4)
    log_prob_threshold: Optional[float] = w_config.get('log_prob_threshold', -1.0)
    no_speech_threshold: Optional[float] = w_config.get('no_speech_threshold', 0.6)
    condition_on_previous_text: bool = w_config.get('condition_on_previous_text', True)
    prompt_reset_on_temperature: float = w_config.get('prompt_reset_on_temperature', 0.5)
    initial_prompt: Optional[Union[str, Iterable[int]]] = w_config.get('initial_prompt')
    prefix: Optional[str] = w_config.get('prefix')
    suppress_blank: bool = w_config.get('suppress_blank', True)
    suppress_tokens: Optional[List[int]] = w_config.get('suppress_tokens', [-1])
    without_timestamps: bool = w_config.get('without_timestamps', False)
    max_initial_timestamp: float = w_config.get('max_initial_timestamp', 1.0)
    word_timestamps: bool = w_config.get('word_timestamps', False)
    prepend_punctuations: str = w_config.get('prepend_punctuations', '"\'“¿([{-')
    append_punctuations: str = w_config.get('append_punctuations', '"\'.。,，!！?？:：”)]}、')
    vad_filter: bool = w_config.get('vad_filter', False)
    vad_parameters: Optional[dict] = w_config.get('vad_parameters')
    max_new_tokens: Optional[int] = w_config.get('max_new_tokens')
    chunk_length: Optional[int] = w_config.get('chunk_length')
    clip_timestamps: Union[str, List[float]] = w_config.get('clip_timestamps', "0")
    hallucination_silence_threshold: Optional[float] = w_config.get('hallucination_silence_threshold')
    hotwords: Optional[str] = w_config.get('hotwords')
    language_detection_threshold: Optional[float] = w_config.get('language_detection_threshold')
    language_detection_segments: int = w_config.get('language_detection_segments', 1)
 class WhisperTranscriber:
    """Small wrapper that owns a ``WhisperModel`` and returns plain text."""

    def __init__(self, model_name, device, compute_type):
        # The model is built once and reused by every transcribe() call.
        self.model = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
        )

    def transcribe(self, audio_file_path: str, options: dict) -> str:
        """
        Transcribe the file and return all segment texts joined by spaces.

        ``options`` is expanded into keyword arguments of the model's
        ``transcribe`` call; the second element of its result is ignored.
        """
        segment_iter, _unused = self.model.transcribe(audio_file_path, **options)
        return " ".join(piece.text for piece in segment_iter)
 # Single shared transcriber, built at import time from the "whisper" config
 # section and reused by every request.
 transcriber = WhisperTranscriber(
    model_name=w_config['model_name'],
    device=w_config['device'],
    compute_type=w_config['compute_type']
 )
@app.post("/transcribe")
 async def transcribe_audio(file: UploadFile = File(...), options: TranscriptionOptions = Depends()):
    temp_audio_file_path = None
    converted_file_path = None
    was_converted = False
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_audio_file:
            temp_audio_file.write(await file.read())
            temp_audio_file_path = temp_audio_file.name
        converted_file_path, was_converted = convert_to_wav(temp_audio_file_path)
        transcription = transcriber.transcribe(converted_file_path, options.dict(exclude_none=True))
        return {"transcription": transcription}
    finally:
        if temp_audio_file_path and os.path.exists(temp_audio_file_path):
            os.remove(temp_audio_file_path)
        if was_converted and converted_file_path and os.path.exists(converted_file_path):
            os.remove(converted_file_path)
 def create_ui():
    """
    Return the HTML source of the single-page upload UI.

    The page lets the user pick an audio file, POSTs it as form data to
    ``/transcribe`` and shows the ``transcription`` field of the JSON reply,
    or an error message.
    """
    return '''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Whisper Transcription</title>
        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
        <style>
            body {
                background-color: #f8f9fa;
            }
            .container {
                max-width: 700px;
            }
            #transcriptionOutput {
                white-space: pre-wrap;
                word-wrap: break-word;
            }
        </style>
    </head>
    <body>
        <div class="container mt-5">
            <div class="card">
                <div class="card-body">
                    <h1 class="card-title text-center mb-4">Upload Audio for Transcription</h1>
                    <div class="mb-3">
                        <input class="form-control" type="file" id="audioFile" accept="audio/*">
                    </div>
                    <div class="d-grid">
                        <button class="btn btn-primary" onclick="transcribeAudio()">
                            <span class="spinner-border spinner-border-sm d-none" role="status" aria-hidden="true" id="spinner"></span>
                            Transcribe
                        </button>
                    </div>
                    <h2 class="mt-4">Transcription:</h2>
                    <div class="p-3 bg-light rounded">
                        <pre id="transcriptionOutput"></pre>
                    </div>
                </div>
            </div>
        </div>
        <script>
            async function transcribeAudio() {
                const fileInput = document.getElementById('audioFile');
                const file = fileInput.files[0];
                if (!file) {
                    alert("Please select a file first.");
                    return;
                }
                const formData = new FormData();
                formData.append('file', file);
                const outputElement = document.getElementById('transcriptionOutput');
                const spinner = document.getElementById('spinner');
                const transcribeButton = document.querySelector('button');
                outputElement.innerText = '';
                spinner.classList.remove('d-none');
                transcribeButton.disabled = true;
                try {
                    const response = await fetch('/transcribe', {
                        method: 'POST',
                        body: formData
                    });
                    if (response.ok) {
                        const result = await response.json();
                        if (result.transcription) {
                            outputElement.innerText = result.transcription;
                        } else if (result.error) {
                            outputElement.innerText = 'Error: ' + result.error;
                        }
                    } else {
                        const errorText = await response.text();
                        outputElement.innerText = 'Error: ' + response.statusText + ' - ' + errorText;
                    }
                } catch (error) {
                    outputElement.innerText = 'An error occurred: ' + error;
                } finally {
                    spinner.classList.add('d-none');
                    transcribeButton.disabled = false;
                }
            }
        </script>
    </body>
    </html>
    '''
 # Script entry point: optionally register the UI route, then start uvicorn.
 if __name__ == "__main__":
    import uvicorn
    s_config = config['server']
    # The UI is enabled either by the config flag or by a --ui command-line
    # argument. The route is registered only here, so it does not exist when
    # the module is merely imported.
    if s_config['ui'] or "--ui" in sys.argv:
        @app.get("/", response_class=HTMLResponse)
        async def read_root():
            return create_ui()
    # Host and port come from the "server" section of config.yaml.
    uvicorn.run(
        app,
        host=s_config['host'],
        port=s_config['port']
    )
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 fastapi
 uvicorn[standard]
 python-multipart
-faster-whisper
+openai-whisper
 ffmpeg-python
 PyYAML