Spaces:

AriNubar
/

xcl-en-demo

Running

Ari Nubar Boyacıoğlu

add files

42bcb30 6 months ago

1.81 kB

	# -*- coding: utf-8 -*-
	import re
	from pysbd.abbreviation_replacer import AbbreviationReplacer
	from pysbd.between_punctuation import BetweenPunctuation
	from pysbd.lang.common import Common, Standard
	from pysbd.punctuation_replacer import replace_punctuation
	from pysbd.cleaner import Cleaner
	from pysbd.utils import Text, Rule

	class Japanese(Common, Standard):
	    """Japanese (``ja``) language definition built on ``Common`` and ``Standard``.

	    The nested ``Cleaner``, ``AbbreviationReplacer`` and ``BetweenPunctuation``
	    classes specialise the imported base classes of the same names.
	    """

	    iso_code = 'ja'

	    class Cleaner(Cleaner):
	        """Cleaner whose ``clean`` step only removes newlines that follow 「の」."""

	        def __init__(self, text, lang, doc_type=None):
	            """Initialise the cleaner.

	            ``doc_type`` is accepted for signature compatibility but is not
	            forwarded to the base initialiser.
	            """
	            super().__init__(text, lang)

	        def clean(self):
	            """Apply the Japanese-specific cleanup and return the resulting text."""
	            self.remove_newline_in_middle_of_word()
	            return self.text

	        def remove_newline_in_middle_of_word(self):
	            """Drop a newline that directly follows 「の」 and precedes non-whitespace.

	            The result is stored back on ``self.text``.
	            """
	            # Pattern: a "\n" preceded by の and followed by a non-space character;
	            # the replacement is the empty string.
	            rule = Rule(r'(?<=の)\n(?=\S)', '')
	            self.text = Text(self.text).apply(rule)

	    class AbbreviationReplacer(AbbreviationReplacer):
	        """Abbreviation replacer with no sentence-starter words configured."""

	        SENTENCE_STARTERS = []

	    class BetweenPunctuation(BetweenPunctuation):
	        """Handles punctuation enclosed in full-width parentheses and corner quotes."""

	        def __init__(self, text):
	            """Store ``text`` through the base initialiser."""
	            super().__init__(text)

	        def replace(self):
	            """Run both bracket substitutions and return the updated text."""
	            self.sub_punctuation_between_quotes_and_parens()
	            return self.text

	        def sub_punctuation_between_parens_ja(self):
	            """Pass every full-width （…） span to ``replace_punctuation``.

	            The span body is a repetition of three alternatives: a run of
	            characters other than （ or ）, two backslashes, or a backslash
	            followed by any character. The lookahead captures into ``tmp`` and
	            the backreference then consumes that same text.
	            """
	            # The alternatives must be separated by a bare `|`; an escaped `\|`
	            # would match a literal pipe and the pattern would never fire.
	            BETWEEN_PARENS_JA_REGEX = r'（(?=(?P<tmp>[^（）]+|\\{2}|\\.)*)(?P=tmp)）'
	            self.text = re.sub(BETWEEN_PARENS_JA_REGEX, replace_punctuation,
	                               self.text)

	        def sub_punctuation_between_quotes_ja(self):
	            """Pass every corner-quoted 「…」 span to ``replace_punctuation``.

	            Uses the same pattern shape as the parenthesis variant, with 「 and 」
	            as the delimiters.
	            """
	            # Bare `|` alternation, for the same reason as in the parenthesis pattern.
	            BETWEEN_QUOTE_JA_REGEX = r'「(?=(?P<tmp>[^「」]+|\\{2}|\\.)*)(?P=tmp)」'
	            self.text = re.sub(BETWEEN_QUOTE_JA_REGEX, replace_punctuation,
	                               self.text)

	        def sub_punctuation_between_quotes_and_parens(self):
	            """Apply the parenthesis substitution, then the quote substitution."""
	            self.sub_punctuation_between_parens_ja()
	            self.sub_punctuation_between_quotes_ja()