Merge pull request #1 from SlavaVlad/Whisper-Based

Whisper based is main now
2025-08-17 23:28:16 +09:00
parent f718da13d6 6ce92c130d
commit 4fd0f18dd1
1 changed files with 20 additions and 38 deletions
--- a/app.py
+++ b/app.py
@@ -18,8 +18,6 @@ logger = logging.getLogger(__name__)

 app = FastAPI()

-model = gigaam.load_model("v2_ctc", device=getenv("ASR_DEVICE"), download_root=getenv("ASR_MODELS_ROOT"))
-
 # API key header
 api_key_header = APIKeyHeader(name="x-api-key")

@@ -44,9 +42,9 @@ def get_keys():  # не бейте меня за это
        return keys


-def convert_audio(input_path: str, output_path: str, speed: float = 1.25):
+def convert_audio(input_path: str, output_path: str, speed: float = 1.0):
    """
-    Convert audio to compatible format and speed up
+    Convert audio to compatible format and speed up if needed.
    """
    try:
        command = [
@@ -66,29 +64,6 @@ def convert_audio(input_path: str, output_path: str, speed: float = 1.25):
        return False


-class TranscriptionMetrics:
-    def __init__(self):
-        self.start_time = time.time()
-        self.end_time = None
-        self.text_length = 0
-        self.audio_duration = 0
-
-    def stop(self, text: str, audio_duration: float):
-        self.end_time = time.time()
-        self.text_length = len(text)
-        self.audio_duration = audio_duration
-
-    def get_metrics(self) -> Dict[str, float]:
-        processing_time = self.end_time - self.start_time
-        return {
-            "processing_time_seconds": round(processing_time, 2),
-            "characters_per_second": round(self.text_length / processing_time, 2),
-            "audio_realtime_ratio": round(self.audio_duration / processing_time, 2),
-            "audio_duration": round(self.audio_duration, 2),
-            "text_length": self.text_length
-        }
-
-
 def get_audio_duration(file_path: str) -> float:
    """Get audio duration using ffprobe"""
    cmd = [
@@ -109,15 +84,29 @@ def get_audio_duration(file_path: str) -> float:
 async def transcribe_audio(
        file: UploadFile = File(...),
        token: str = Depends(api_key_header),
-        model_name: str = "turbo"
+        model: str = "turbo",
+        verbose: Optional[bool] = None,
+        temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+        compression_ratio_threshold: Optional[float] = 2.4,
+        speed_up: Optional[float] = 1.25,
+        logprob_threshold: Optional[float] = -1.0,
+        no_speech_threshold: Optional[float] = 0.6,
+        condition_on_previous_text: bool = True,
+        initial_prompt: Optional[str] = None,
+        word_timestamps: bool = False,
+        prepend_punctuations: str = "\"'\"¿([{-",
+        append_punctuations: str = "\"\'.。,，!！?？:：\")]}、",
+        clip_timestamps: Union[str, List[float]] = "0",
+        hallucination_silence_threshold: Optional[float] = None
 ):
    # Token validation
    if token not in get_keys():
        logger.warning(f"Invalid token attempt: {token}")
        raise HTTPException(status_code=403, detail="Forbidden")

-    logger.info(f"Processing file: {file.filename} with model: {model_name}")
-    metrics = TranscriptionMetrics()
+    model = whisper.load_model(model)  # Load the Whisper model
+
+    logger.info(f"Processing file: {file.filename} with model: {model}")

    # Save uploaded file
    temp_input_path = f"/tmp/input_{file.filename}"
@@ -129,7 +118,7 @@ async def transcribe_audio(

        # Convert audio if needed
        logger.debug("Converting audio file")
-        if not convert_audio(temp_input_path, temp_output_path):
+        if not convert_audio(temp_input_path, temp_output_path, speed_up):
            raise HTTPException(status_code=400, detail="Audio conversion failed")

        # Get audio duration before speed up
@@ -158,13 +147,6 @@ async def transcribe_audio(
            "text": full_text
        }

-        # Calculate metrics
-        metrics.stop(full_text, original_duration)
-        logger.info(f"Transcription metrics: {metrics.get_metrics()}")
-
-        # Add metrics to result
-        result["metrics"] = metrics.get_metrics()
-
        return result

    except Exception as e: