feat: add language-aware sentence tokenization (#1269)

* feat: add language-aware sentence tokenization * feat: add missing punkt languages --------- Co-authored-by: pulkit <129310466+p1kit@users.noreply.github.com> Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
2025-10-21 19:27:26 +05:30
parent 6e1d1caaf4
commit db317c358b
2 changed files with 28 additions and 3 deletions
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -14,7 +14,7 @@ import torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

 from whisperx.audio import SAMPLE_RATE, load_audio
-from whisperx.utils import interpolate_nans
+from whisperx.utils import interpolate_nans, PUNKT_LANGUAGES
 from whisperx.schema import (
    AlignedTranscriptionResult,
    SingleSegment,
@@ -192,11 +192,13 @@ def align(
                clean_wdx.append(wdx)


+        # Use language-specific Punkt model if available otherwise we fallback to English.
+        punkt_lang = PUNKT_LANGUAGES.get(model_lang, 'english')
        try:
-            sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
+            sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle')
        except LookupError:
            nltk.download('punkt_tab', quiet=True)
-            sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
+            sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle')
        sentence_spans = list(sentence_splitter.span_tokenize(text))

        segment_data[sdx] = {
--- a/whisperx/utils.py
+++ b/whisperx/utils.py
@@ -126,6 +126,29 @@ TO_LANGUAGE_CODE = {

 LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]

+# Mapping of language codes to NLTK Punkt tokenizer model names
+PUNKT_LANGUAGES = {
+    'cs': 'czech',
+    'da': 'danish', 
+    'de': 'german',
+    'el': 'greek',
+    'en': 'english',
+    'es': 'spanish',
+    'et': 'estonian',
+    'fi': 'finnish',
+    'fr': 'french',
+    'it': 'italian',
+    'nl': 'dutch',
+    'no': 'norwegian',
+    'pl': 'polish',
+    'pt': 'portuguese',
+    'sl': 'slovene',
+    'sv': 'swedish',
+    'tr': 'turkish',
+    "ml": "malayalam"
+    "ru": "russian",
+}
+
 system_encoding = sys.getdefaultencoding()

 if system_encoding != "utf-8":