feat: update Punkt tokenizer to use the pre-trained English model and download missing NLTK data

This commit is contained in:
Barabazs
2025-10-02 16:06:38 +00:00
parent ed13dc8c6c
commit bf150e442e

View File

@@ -22,9 +22,8 @@ from whisperx.types import (
SingleWordSegment, SingleWordSegment,
SegmentData, SegmentData,
) )
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters import nltk
from nltk.data import load as nltk_load
PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof', 'jr', 'sr', 'ph.d']
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"] LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
@@ -188,9 +187,11 @@ def align(
clean_wdx.append(wdx) clean_wdx.append(wdx)
punkt_param = PunktParameters() try:
punkt_param.abbrev_types = set(PUNKT_ABBREVIATIONS) sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
sentence_splitter = PunktSentenceTokenizer(punkt_param) except LookupError:
nltk.download('punkt_tab', quiet=True)
sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
sentence_spans = list(sentence_splitter.span_tokenize(text)) sentence_spans = list(sentence_splitter.span_tokenize(text))
segment_data[sdx] = { segment_data[sdx] = {