import nltk
import jieba
import sudachipy  # requires a SudachiPy dictionary package such as sudachidict_core
import langid

# Download NLTK's Punkt model, used for English sentence segmentation.
nltk.download('punkt')
# Restrict language identification to the three languages handled below.
langid.set_languages(['en', 'zh', 'ja'])
def split_text_into_sentences(text):
    # Detect the language once and dispatch to the matching splitter.
    lang = langid.classify(text)[0]
    if lang == "en":
        # English: NLTK's Punkt tokenizer handles sentence boundaries directly.
        return nltk.tokenize.sent_tokenize(text)
    elif lang == "zh":
        # Chinese: segment with jieba, then split on sentence-final punctuation.
        sentences = []
        segs = list(jieba.cut(text, cut_all=False))
        start = 0
        for i, seg in enumerate(segs):
            # Cover both full-width and half-width sentence-ending marks.
            if seg in ["。", "！", "？", "……", "!", "?"]:
                sentences.append("".join(segs[start:i + 1]))
                start = i + 1
        if start < len(segs):
            # Keep any trailing text without a closing punctuation mark.
            sentences.append("".join(segs[start:]))
        return sentences
    elif lang == "ja":
        # Japanese: tokenize with SudachiPy and cut after 句点 (sentence-final) symbols.
        sentences = []
        tokenizer = sudachipy.Dictionary().create()
        current_sentence = ""
        for token in tokenizer.tokenize(text):
            current_sentence += token.surface()
            pos = token.part_of_speech()
            if pos[0] == "補助記号" and pos[1] == "句点":
                sentences.append(current_sentence)
                current_sentence = ""
        if current_sentence:
            sentences.append(current_sentence)
        return sentences
    # Unreachable: set_languages() above limits classification to en/zh/ja.
    raise RuntimeError("It is impossible to reach here.")
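# Minimal usage sketch: the sample strings below are illustrative assumptions,
# chosen only to exercise the English, Chinese, and Japanese branches of
# split_text_into_sentences; actual inputs and detection results may vary.
if __name__ == "__main__":
    samples = [
        "Hello there. How are you today?",        # English
        "今天天气很好。我们去公园散步吧！",          # Chinese
        "今日はいい天気です。公園へ行きましょう。",   # Japanese
    ]
    for sample in samples:
        print(split_text_into_sentences(sample))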