File size: 1,811 Bytes
42bcb30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.punctuation_replacer import replace_punctuation
from pysbd.cleaner import Cleaner
from pysbd.utils import Text, Rule

class Japanese(Common, Standard):

    iso_code = 'ja'

    class Cleaner(Cleaner):

        def __init__(self, text, lang, doc_type=None):
            super().__init__(text, lang)

        def clean(self):
            self.remove_newline_in_middle_of_word()
            return self.text

        def remove_newline_in_middle_of_word(self):
            NewLineInMiddleOfWordRule = Rule(r'(?<=の)\n(?=\S)', '')
            self.text = Text(self.text).apply(NewLineInMiddleOfWordRule)

    class AbbreviationReplacer(AbbreviationReplacer):
        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):

        def __init__(self, text):
            super().__init__(text)

        def replace(self):
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_parens_ja(self):
            BETWEEN_PARENS_JA_REGEX = r'((?=(?P<tmp>[^()]+|\\{2}|\\.)*)(?P=tmp))'
            self.text = re.sub(BETWEEN_PARENS_JA_REGEX, replace_punctuation,
                      self.text)

        def sub_punctuation_between_quotes_ja(self):
            BETWEEN_QUOTE_JA_REGEX = r'「(?=(?P<tmp>[^「」]+|\\{2}|\\.)*)(?P=tmp)」'
            self.text = re.sub(BETWEEN_QUOTE_JA_REGEX, replace_punctuation,
                      self.text)

        def sub_punctuation_between_quotes_and_parens(self):
            self.sub_punctuation_between_parens_ja()
            self.sub_punctuation_between_quotes_ja()