feat: update Punkt tokenizer to use the pre-trained English model and download missing NLTK data
This commit is contained in:
@@ -22,9 +22,8 @@ from whisperx.types import (
|
|||||||
SingleWordSegment,
|
SingleWordSegment,
|
||||||
SegmentData,
|
SegmentData,
|
||||||
)
|
)
|
||||||
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
|
import nltk
|
||||||
|
from nltk.data import load as nltk_load
|
||||||
PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof', 'jr', 'sr', 'ph.d']
|
|
||||||
|
|
||||||
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
|
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
|
||||||
|
|
||||||
@@ -188,9 +187,11 @@ def align(
|
|||||||
clean_wdx.append(wdx)
|
clean_wdx.append(wdx)
|
||||||
|
|
||||||
|
|
||||||
punkt_param = PunktParameters()
|
try:
|
||||||
punkt_param.abbrev_types = set(PUNKT_ABBREVIATIONS)
|
sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
|
||||||
sentence_splitter = PunktSentenceTokenizer(punkt_param)
|
except LookupError:
|
||||||
|
nltk.download('punkt_tab', quiet=True)
|
||||||
|
sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
|
||||||
sentence_spans = list(sentence_splitter.span_tokenize(text))
|
sentence_spans = list(sentence_splitter.span_tokenize(text))
|
||||||
|
|
||||||
segment_data[sdx] = {
|
segment_data[sdx] = {
|
||||||
|
|||||||
Reference in New Issue
Block a user