micro server with small ui on 8000 for running faster_whisper
This commit is contained in:
5
.idea/.gitignore
generated
vendored
Normal file
5
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
||||||
9
config.yaml
Normal file
9
config.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
server:
|
||||||
|
host: "0.0.0.0"
|
||||||
|
port: 8000
|
||||||
|
ui: true
|
||||||
|
|
||||||
|
whisper:
|
||||||
|
model_name: "turbo"
|
||||||
|
device: "cuda"
|
||||||
|
compute_type: "int8"
|
||||||
45
converter.py
Normal file
45
converter.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
import ffmpeg
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
def is_valid_format(file_path: str) -> bool:
    """Return True if the file is already a 16 kHz mono 16-bit PCM WAV.

    The file is inspected with ``ffmpeg.probe``. The container must be WAV
    and the first audio stream must be ``pcm_s16le``, single-channel and
    sampled at 16000 Hz.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        True when the file needs no conversion; False otherwise, including
        when the file has no audio stream or probing fails.
    """
    try:
        probe = ffmpeg.probe(file_path)
    except ffmpeg.Error:
        return False

    # The probe may report several comma-separated demuxer names; a codec
    # match alone does not prove the container is WAV.
    container_names = probe.get('format', {}).get('format_name', '').split(',')
    if 'wav' not in container_names:
        return False

    audio_stream = next(
        (
            stream
            for stream in probe.get('streams', [])
            if stream.get('codec_type') == 'audio'
        ),
        None,
    )
    if audio_stream is None:
        return False

    return (
        audio_stream.get('codec_name') == 'pcm_s16le'
        and audio_stream.get('channels') == 1
        # The sample rate is compared as text, as in the probe output.
        and audio_stream.get('sample_rate') == '16000'
    )
|
||||||
|
|
||||||
|
def convert_to_wav(input_file_path: str) -> tuple[str, bool]:
    """Convert an audio file to 16 kHz mono 16-bit PCM WAV if needed.

    Args:
        input_file_path: Path of the source audio file.

    Returns:
        A ``(path, converted)`` tuple. When the input already has the target
        format, ``path`` is ``input_file_path`` and ``converted`` is False.
        Otherwise ``path`` names a newly created temporary ``.wav`` file,
        which the caller is responsible for deleting, and ``converted`` is
        True.

    Raises:
        ffmpeg.Error: If ffmpeg fails to convert the file.
    """
    if is_valid_format(input_file_path):
        return input_file_path, False

    # mkstemp creates the file atomically; the deprecated mktemp only
    # returns a name, which another process could claim before ffmpeg does.
    fd, output_file_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        ffmpeg.input(input_file_path).output(
            output_file_path,
            acodec='pcm_s16le',
            ac=1,
            ar='16k',
        ).run(
            capture_stdout=True,
            capture_stderr=True,
            # The placeholder file already exists, so ffmpeg must overwrite it.
            overwrite_output=True,
        )
    except Exception:
        # Never leave the temporary file behind, whatever went wrong
        # (conversion error, missing ffmpeg binary, ...).
        if os.path.exists(output_file_path):
            os.remove(output_file_path)
        raise

    return output_file_path, True
|
||||||
|
|
||||||
198
main.py
Normal file
198
main.py
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import sys
|
||||||
|
import yaml
|
||||||
|
from typing import Optional, List, Union, Tuple, Iterable
|
||||||
|
from fastapi import FastAPI, UploadFile, File, Depends
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from fastapi.responses import HTMLResponse
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
from converter import convert_to_wav
|
||||||
|
|
||||||
|
# Settings are read once, at import time, from config.yaml in the current
# working directory. The encoding is explicit so that non-ASCII values are
# read the same way regardless of the platform's default encoding.
with open("config.yaml", 'r', encoding="utf-8") as f:
    config = yaml.safe_load(f)

app = FastAPI()

# The "whisper" section holds the model settings and the default values for
# the transcription options.
w_config = config['whisper']
|
||||||
|
|
||||||
|
class TranscriptionOptions(BaseModel):
    """Per-request transcription options.

    Every field takes its default from the ``whisper`` section of the
    configuration (``w_config``), falling back to the literal given here when
    the key is absent. The /transcribe endpoint dumps this model with
    ``exclude_none=True`` and forwards the result as keyword arguments to
    ``WhisperModel.transcribe``, so fields left as ``None`` are not passed.
    """

    # Language and task selection.
    language: Optional[str] = w_config.get('language')
    task: str = w_config.get('task', 'transcribe')

    # Decoding / search parameters.
    beam_size: int = w_config.get('beam_size', 5)
    best_of: int = w_config.get('best_of', 5)
    patience: float = w_config.get('patience', 1.0)
    length_penalty: float = w_config.get('length_penalty', 1.0)
    repetition_penalty: float = w_config.get('repetition_penalty', 1.0)
    no_repeat_ngram_size: int = w_config.get('no_repeat_ngram_size', 0)
    # Either a single temperature or a sequence of temperatures.
    temperature: Union[float, List[float], Tuple[float, ...]] = w_config.get('temperature', [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    log_progress: bool = w_config.get('log_progress', False)

    # Thresholds; None is allowed and is then omitted from the call.
    compression_ratio_threshold: Optional[float] = w_config.get('compression_ratio_threshold', 2.4)
    log_prob_threshold: Optional[float] = w_config.get('log_prob_threshold', -1.0)
    no_speech_threshold: Optional[float] = w_config.get('no_speech_threshold', 0.6)

    # Prompting and context.
    condition_on_previous_text: bool = w_config.get('condition_on_previous_text', True)
    prompt_reset_on_temperature: float = w_config.get('prompt_reset_on_temperature', 0.5)
    initial_prompt: Optional[Union[str, Iterable[int]]] = w_config.get('initial_prompt')
    prefix: Optional[str] = w_config.get('prefix')

    # Token suppression.
    suppress_blank: bool = w_config.get('suppress_blank', True)
    suppress_tokens: Optional[List[int]] = w_config.get('suppress_tokens', [-1])

    # Timestamps and punctuation handling.
    without_timestamps: bool = w_config.get('without_timestamps', False)
    max_initial_timestamp: float = w_config.get('max_initial_timestamp', 1.0)
    word_timestamps: bool = w_config.get('word_timestamps', False)
    prepend_punctuations: str = w_config.get('prepend_punctuations', '"\'“¿([{-')
    append_punctuations: str = w_config.get('append_punctuations', '"\'.。,,!!??::”)]}、')

    # Voice-activity detection.
    vad_filter: bool = w_config.get('vad_filter', False)
    vad_parameters: Optional[dict] = w_config.get('vad_parameters')

    # Remaining options; those without a fallback default to None.
    max_new_tokens: Optional[int] = w_config.get('max_new_tokens')
    chunk_length: Optional[int] = w_config.get('chunk_length')
    clip_timestamps: Union[str, List[float]] = w_config.get('clip_timestamps', "0")
    hallucination_silence_threshold: Optional[float] = w_config.get('hallucination_silence_threshold')
    hotwords: Optional[str] = w_config.get('hotwords')
    language_detection_threshold: Optional[float] = w_config.get('language_detection_threshold')
    language_detection_segments: int = w_config.get('language_detection_segments', 1)
|
||||||
|
|
||||||
|
class WhisperTranscriber:
    """Thin wrapper around a faster-whisper ``WhisperModel``."""

    def __init__(self, model_name, device, compute_type):
        """Load the model.

        Args:
            model_name: Model identifier, passed to ``WhisperModel`` as is.
            device: Forwarded as the ``device`` keyword of ``WhisperModel``.
            compute_type: Forwarded as the ``compute_type`` keyword of
                ``WhisperModel``.
        """
        self.model = WhisperModel(model_name, device=device, compute_type=compute_type)

    def transcribe(self, audio_file_path: str, options: dict) -> str:
        """Transcribe an audio file and return the text as one string.

        Args:
            audio_file_path: Path of the audio file to transcribe.
            options: Keyword arguments forwarded to the model's
                ``transcribe`` method.

        Returns:
            The segment texts joined by single spaces, or an empty string
            when there are no non-empty segments.
        """
        segments, _ = self.model.transcribe(audio_file_path, **options)
        # Segment texts commonly carry their own leading whitespace; trim
        # each piece so the result has no leading or doubled spaces.
        pieces = (segment.text.strip() for segment in segments)
        return " ".join(piece for piece in pieces if piece)
|
||||||
|
|
||||||
|
# Single shared transcriber, built at import time from the three required
# keys of the whisper configuration section.
transcriber = WhisperTranscriber(
    **{key: w_config[key] for key in ('model_name', 'device', 'compute_type')}
)
|
||||||
|
|
||||||
|
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...), options: TranscriptionOptions = Depends()):
    """Transcribe an uploaded audio file.

    The upload is written to a temporary file, converted with
    ``convert_to_wav`` when it is not already in the expected format, and
    passed to the module-level ``transcriber``. Both temporary files are
    removed afterwards, whether or not transcription succeeded.

    Args:
        file: The uploaded audio file.
        options: Transcription options; fields that are None are not
            forwarded to the model.

    Returns:
        A dict of the form ``{"transcription": <text>}``.
    """
    temp_audio_file_path = None
    converted_file_path = None
    was_converted = False
    try:
        # The upload's filename may be missing; fall back to no suffix.
        suffix = os.path.splitext(file.filename or "")[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio_file:
            # Record the path before writing so the cleanup below still
            # removes the file if reading or writing the upload fails.
            temp_audio_file_path = temp_audio_file.name
            temp_audio_file.write(await file.read())

        converted_file_path, was_converted = convert_to_wav(temp_audio_file_path)

        # Pydantic v2 renamed .dict() to .model_dump(); support both.
        dump_options = getattr(options, "model_dump", None) or options.dict
        transcription = transcriber.transcribe(converted_file_path, dump_options(exclude_none=True))

        return {"transcription": transcription}
    finally:
        if temp_audio_file_path and os.path.exists(temp_audio_file_path):
            os.remove(temp_audio_file_path)
        # Only delete the converted file when it is a separate temp file.
        if was_converted and converted_file_path and os.path.exists(converted_file_path):
            os.remove(converted_file_path)
|
||||||
|
|
||||||
|
def create_ui():
    """Return the HTML for the single-page upload UI.

    The page loads Bootstrap from a CDN, lets the user pick an audio file,
    POSTs it to ``/transcribe`` as multipart form data, and shows the
    returned transcription (or an error message) in a ``<pre>`` element.
    """
    return '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Whisper Transcription</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<style>
body {
background-color: #f8f9fa;
}
.container {
max-width: 700px;
}
#transcriptionOutput {
white-space: pre-wrap;
word-wrap: break-word;
}
</style>
</head>
<body>
<div class="container mt-5">
<div class="card">
<div class="card-body">
<h1 class="card-title text-center mb-4">Upload Audio for Transcription</h1>
<div class="mb-3">
<input class="form-control" type="file" id="audioFile" accept="audio/*">
</div>
<div class="d-grid">
<button class="btn btn-primary" onclick="transcribeAudio()">
<span class="spinner-border spinner-border-sm d-none" role="status" aria-hidden="true" id="spinner"></span>
Transcribe
</button>
</div>
<h2 class="mt-4">Transcription:</h2>
<div class="p-3 bg-light rounded">
<pre id="transcriptionOutput"></pre>
</div>
</div>
</div>
</div>

<script>
async function transcribeAudio() {
const fileInput = document.getElementById('audioFile');
const file = fileInput.files[0];
if (!file) {
alert("Please select a file first.");
return;
}

const formData = new FormData();
formData.append('file', file);

const outputElement = document.getElementById('transcriptionOutput');
const spinner = document.getElementById('spinner');
const transcribeButton = document.querySelector('button');

outputElement.innerText = '';
spinner.classList.remove('d-none');
transcribeButton.disabled = true;


try {
const response = await fetch('/transcribe', {
method: 'POST',
body: formData
});

if (response.ok) {
const result = await response.json();
if (result.transcription) {
outputElement.innerText = result.transcription;
} else if (result.error) {
outputElement.innerText = 'Error: ' + result.error;
}
} else {
const errorText = await response.text();
outputElement.innerText = 'Error: ' + response.statusText + ' - ' + errorText;
}
} catch (error) {
outputElement.innerText = 'An error occurred: ' + error;
} finally {
spinner.classList.add('d-none');
transcribeButton.disabled = false;
}
}
</script>
</body>
</html>
'''
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import uvicorn

    s_config = config['server']

    # The UI route is only registered when the module is run as a script,
    # and only if enabled in the config or requested with --ui.
    ui_enabled = s_config['ui'] or "--ui" in sys.argv
    if ui_enabled:
        async def read_root():
            """Serve the upload page."""
            return create_ui()

        app.get("/", response_class=HTMLResponse)(read_root)

    uvicorn.run(app, host=s_config['host'], port=s_config['port'])
|
||||||
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
fastapi
|
||||||
|
uvicorn[standard]
|
||||||
|
python-multipart
|
||||||
|
faster-whisper
|
||||||
|
ffmpeg-python
|
||||||
|
PyYAML
|
||||||
Reference in New Issue
Block a user