File size: 5,334 Bytes
42bcb30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard
from pysbd.between_punctuation import BetweenPunctuation
import re
from functools import partial
from pysbd.punctuation_replacer import replace_punctuation

class Armenian(Common, Standard):
    """Armenian language rules, built on top of ``Common`` and ``Standard``.

    The class only declares data (regex patterns, punctuation) plus two
    nested helper classes; how the surrounding framework consumes them is
    not visible in this file.
    """

    iso_code = 'hy'

    # Alternation of patterns describing sentence spans. The last
    # alternative ends a span at the Armenian full stop '։' unless that
    # character is directly preceded by a digit.
    SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[Ա-ՖA-Z])|「(?:[^」])*」(?=\s[Ա-ՖA-Z])|\((?:[^\)]){2,}\)(?=\s[Ա-ՖA-Z])|\'(?:[^\'])*[^,]\'(?=\s[Ա-ՖA-Z])|\"(?:[^\"])*[^,]\"(?=\s[Ա-ՖA-Z])|\“(?:[^\”])*[^,]\”(?=\s[Ա-ՖA-Z])|[。..!!?? ]{2,}|\S.*?[。..!!??ȸȹ☉☈☇☄]|[。..!!??]|.*?(?<!\d)[։]"

    # Disabled alternative pattern, kept for reference:
    # SENTENCE_BOUNDARY_REGEX = r'((?:[^)])*)(?=\s?[Ա-ՖA-Z0-9])|.*?(?<!\d)[։]|.*?$'

    # Holds the Armenian full stop; read by code outside this class.
    Punctuations = ['։']

    # A terminator, then a closing quote, one whitespace character and an
    # upper-case Latin or Armenian letter.
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[.․։][\"\'“”»«]\s{1}[A-ZԱ-Ֆ]'

    # Matches only the single whitespace character sitting between such a
    # "terminator + quote" pair and the following upper-case letter.
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[.․։][\"\'“”»«])\s{1}(?=[A-ZԱ-Ֆ])'

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer variant for Armenian."""

        # No sentence-starter words are configured for this language.
        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Applies ``replace_punctuation`` to text enclosed in paired delimiters.

        Each delimiter kind comes in up to two flavours: one written with
        atomic groups ``(?>...)`` and a ``*_2`` flavour written with a
        lookahead plus named back-reference. The methods of this class only
        use the ``*_2`` flavours (and the two single-quote patterns).
        NOTE(review): atomic groups are only accepted by Python's ``re``
        from 3.11 on -- confirm before compiling the non-``_2`` patterns.
        """

        # Straight single quotes preceded by whitespace; a quote directly
        # followed by an Armenian letter does not close the span.
        BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'"

        # Same idea for the slanted single quotes ‘ ... ’.
        BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX = r"(?<=\s)‘(?:[^’]|’[ա-ֆԱ-Ֆ])*’"

        # Straight double quotes.
        BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'

        BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'

        # Rubular: http://rubular.com/r/x6s4PZK8jc
        # Guillemets « ... ».
        BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'

        BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"

        # Rubular: http://rubular.com/r/JbAIpKdlSq
        # Slanted double quotes “ ... ”.
        BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
        BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"

        # Rubular: http://rubular.com/r/WX4AvnZvlX
        # Square brackets [ ... ].
        BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"

        BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'

        # Rubular: http://rubular.com/r/6tTityPflI
        # Parentheses ( ... ).
        BETWEEN_PARENS_ARMENIAN_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"

        BETWEEN_PARENS_ARMENIAN_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"

        # Rubular: http://rubular.com/r/mXf8cW025o
        # A single-quoted span immediately followed by a non-space character.
        WORD_WITH_LEADING_APOSTROPHE_ARMENIAN = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'\S"

        # Rubular: http://rubular.com/r/jTtDKfjxzr
        # Text between two "--" markers.
        BETWEEN_EM_DASHES_REGEX_ARMENIAN = r"\-\-(?>[^\-\-])*\-\-"

        BETWEEN_EM_DASHES_REGEX_2_ARMENIAN = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"

        def __init__(self, text):
            """Forward ``text`` unchanged to the parent initializer."""
            super().__init__(text)

        def replace(self):
            """Run the inherited pass on ``self.text``, then the Armenian passes.

            Returns whatever the last Armenian pass returns.
            """
            after_base_pass = self.sub_punctuation_between_quotes_and_parens(self.text)
            return self.sub_punctuation_between_quotes_and_parens_armenian(after_base_pass)

        def sub_punctuation_between_quotes_and_parens_armenian(self, txt):
            """Feed ``txt`` through every Armenian delimiter pass, in fixed order."""
            # The order below is significant: each pass receives the output
            # of the previous one.
            ordered_passes = (
                self.sub_punctuation_between_single_quotes_armenian,
                self.sub_punctuation_between_single_quote_slanted_armenian,
                self.sub_punctuation_between_double_quotes_armenian,
                self.sub_punctuation_between_square_brackets_armenian,
                self.sub_punctuation_between_parens_armenian,
                self.sub_punctuation_between_quotes_arrow_armenian,
                self.sub_punctuation_between_em_dashes_armenian,
                self.sub_punctuation_between_quotes_slanted_armenian,
            )
            for run_pass in ordered_passes:
                txt = run_pass(txt)
            return txt

        def sub_punctuation_between_single_quotes_armenian(self, txt):
            """Handle straight single-quoted spans.

            ``txt`` is returned untouched when it contains a quoted span glued
            to a following non-space character and no quote followed by
            whitespace anywhere; otherwise every span matched by
            ``BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX`` is passed to
            ``replace_punctuation`` with ``match_type='single'``.
            """
            glued_quote_found = re.search(self.WORD_WITH_LEADING_APOSTROPHE_ARMENIAN, txt)
            if glued_quote_found and not re.search(r"'\s", txt):
                return txt
            single_quote_callback = partial(replace_punctuation, match_type='single')
            return re.sub(
                self.BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX,
                single_quote_callback,
                txt,
            )

        def sub_punctuation_between_single_quote_slanted_armenian(self, txt):
            """Apply ``replace_punctuation`` to spans inside slanted single quotes."""
            slanted_single_pattern = self.BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX
            return re.sub(slanted_single_pattern, replace_punctuation, txt)

        def sub_punctuation_between_parens_armenian(self, txt):
            """Apply ``replace_punctuation`` to parenthesised spans."""
            parens_pattern = self.BETWEEN_PARENS_ARMENIAN_REGEX_2
            return re.sub(parens_pattern, replace_punctuation, txt)

        def sub_punctuation_between_square_brackets_armenian(self, txt):
            """Apply ``replace_punctuation`` to spans inside square brackets."""
            brackets_pattern = self.BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2
            return re.sub(brackets_pattern, replace_punctuation, txt)

        def sub_punctuation_between_double_quotes_armenian(self, txt):
            """Apply ``replace_punctuation`` to spans inside straight double quotes."""
            double_quotes_pattern = self.BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2
            return re.sub(double_quotes_pattern, replace_punctuation, txt)

        def sub_punctuation_between_quotes_arrow_armenian(self, txt):
            """Apply ``replace_punctuation`` to spans inside guillemets."""
            guillemets_pattern = self.BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2
            return re.sub(guillemets_pattern, replace_punctuation, txt)

        def sub_punctuation_between_em_dashes_armenian(self, txt):
            """Apply ``replace_punctuation`` to spans between ``--`` markers."""
            dashes_pattern = self.BETWEEN_EM_DASHES_REGEX_2_ARMENIAN
            return re.sub(dashes_pattern, replace_punctuation, txt)

        def sub_punctuation_between_quotes_slanted_armenian(self, txt):
            """Apply ``replace_punctuation`` to spans inside slanted double quotes."""
            slanted_double_pattern = self.BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2
            return re.sub(slanted_double_pattern, replace_punctuation, txt)