feat: add language-aware sentence tokenization (#1269)

* feat: add language-aware sentence tokenization

* feat: add missing punkt languages

---------

Co-authored-by: pulkit <129310466+p1kit@users.noreply.github.com>
Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
This commit is contained in:
pplkit
2025-10-21 19:27:26 +05:30
committed by GitHub
parent 6e1d1caaf4
commit db317c358b
2 changed files with 28 additions and 3 deletions

View File

@@ -14,7 +14,7 @@ import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from whisperx.audio import SAMPLE_RATE, load_audio
from whisperx.utils import interpolate_nans
from whisperx.utils import interpolate_nans, PUNKT_LANGUAGES
from whisperx.schema import (
AlignedTranscriptionResult,
SingleSegment,
@@ -192,11 +192,13 @@ def align(
clean_wdx.append(wdx)
# Use the language-specific Punkt model if available; otherwise fall back to English.
punkt_lang = PUNKT_LANGUAGES.get(model_lang, 'english')
try:
sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle')
except LookupError:
nltk.download('punkt_tab', quiet=True)
sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle')
sentence_spans = list(sentence_splitter.span_tokenize(text))
segment_data[sdx] = {

View File

@@ -126,6 +126,29 @@ TO_LANGUAGE_CODE = {
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
# Mapping of two-letter language codes to NLTK Punkt tokenizer model names.
# Codes that are not listed here have no dedicated Punkt model in this table;
# callers should look entries up with a default (e.g. ``.get(code, 'english')``).
PUNKT_LANGUAGES = {
    'cs': 'czech',
    'da': 'danish',
    'de': 'german',
    'el': 'greek',
    'en': 'english',
    'es': 'spanish',
    'et': 'estonian',
    'fi': 'finnish',
    'fr': 'french',
    'it': 'italian',
    'nl': 'dutch',
    'no': 'norwegian',
    'pl': 'polish',
    'pt': 'portuguese',
    'sl': 'slovene',
    'sv': 'swedish',
    'tr': 'turkish',
    # The comma after each entry is required: without it, adjacent string
    # literals are concatenated and the next key becomes a syntax error.
    "ml": "malayalam",
    "ru": "russian",
}
system_encoding = sys.getdefaultencoding()
if system_encoding != "utf-8":