# -*- coding: utf-8 -*-
import re
from functools import partial

from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.punctuation_replacer import replace_punctuation


class Armenian(Common, Standard):
    """Armenian ('hy') language rules plus between-punctuation substitution.

    The class carries its own set of "between punctuation" patterns (quotes,
    brackets, parentheses, em dashes) and applies ``replace_punctuation`` to
    every span they match.

    NOTE(review): this file was recovered from a copy in which ``<...>`` spans
    had been stripped (e.g. ``(?P<tmp>`` had become ``(?P``). Every value that
    had to be reconstructed is marked with a NOTE(review) comment below and
    should be confirmed against a pristine copy.
    """

    iso_code = 'hy'

    # One alternative per line; the concatenation is a single pattern.
    # NOTE(review): the duplicated characters inside the punctuation classes
    # ("..", "!!", "??") and the ASCII parentheses in the first alternative
    # look like full-width characters that were normalised to ASCII. They are
    # kept exactly as found; confirm against a pristine copy.
    SENTENCE_BOUNDARY_REGEX = (
        r"((?:[^)])*)(?=\s?[Ա-ՖA-Z])"
        r"|「(?:[^」])*」(?=\s[Ա-ՖA-Z])"
        r"|\((?:[^\)]){2,}\)(?=\s[Ա-ՖA-Z])"
        r"|\'(?:[^\'])*[^,]\'(?=\s[Ա-ՖA-Z])"
        r"|\"(?:[^\"])*[^,]\"(?=\s[Ա-ՖA-Z])"
        r"|\“(?:[^\”])*[^,]\”(?=\s[Ա-ՖA-Z])"
        r"|[。..!!?? ]{2,}"
        r"|\S.*?[。..!!??ȸȹ☉☈☇☄]"
        r"|[。..!!??]"
        # NOTE(review): everything after "(?" on the next line was lost in the
        # damaged copy; reconstructed as a negative look-behind for a digit
        # followed by the same punctuation class, then the end-of-text
        # fallback.
        r"|.*?(?<!\d)[。..!!??ȸȹ☉☈☇☄]"
        r"|.*?$"
    )

    # NOTE(review): the two single-quote patterns below are referenced by the
    # methods of this class but their definitions were lost in the damaged
    # copy. They are reconstructed by analogy with
    # WORD_WITH_LEADING_APOSTROPHE_ARMENIAN (same pattern without the
    # trailing \S) -- confirm against a pristine copy.
    BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'"
    BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX = r"(?<=\s)‘(?:[^’]|’[ա-ֆԱ-Ֆ])*’"

    # The patterns without the "_2" suffix use atomic groups "(?>...)", which
    # Python's ``re`` only accepts from 3.11 on. No method in this class uses
    # them; the "_2" variants emulate the atomic group with a look-ahead that
    # captures into the named group ``tmp`` and a back-reference ``(?P=tmp)``.
    # NOTE(review): the leading '"(?>' of the next pattern was lost in the
    # damaged copy and is restored to match its sibling patterns.
    BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'
    BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'

    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'
    BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"

    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
    BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"

    # Rubular: http://rubular.com/r/WX4AvnZvlX
    BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"
    BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'

    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_PARENS_ARMENIAN_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"
    BETWEEN_PARENS_ARMENIAN_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"

    # Rubular: http://rubular.com/r/mXf8cW025o
    WORD_WITH_LEADING_APOSTROPHE_ARMENIAN = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'\S"

    # Rubular: http://rubular.com/r/jTtDKfjxzr
    BETWEEN_EM_DASHES_REGEX_ARMENIAN = r"\-\-(?>[^\-\-])*\-\-"
    BETWEEN_EM_DASHES_REGEX_2_ARMENIAN = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"

    def __init__(self, text):
        """Forward ``text`` to the parent initialiser.

        NOTE(review): ``replace`` reads ``self.text``; presumably the parent
        ``__init__`` stores it -- confirm against the base classes.
        """
        super().__init__(text)

    def replace(self):
        """Run the inherited substitution pass, then the Armenian one.

        Returns:
            The text after ``sub_punctuation_between_quotes_and_parens``
            (not defined in this class; presumably inherited) followed by
            ``sub_punctuation_between_quotes_and_parens_armenian``.
        """
        text = self.sub_punctuation_between_quotes_and_parens(self.text)
        return self.sub_punctuation_between_quotes_and_parens_armenian(text)

    def sub_punctuation_between_quotes_and_parens_armenian(self, txt):
        """Apply every Armenian between-punctuation substitution to ``txt``.

        The passes run in a fixed order, each one receiving the output of the
        previous one.

        Args:
            txt: Text to process.

        Returns:
            The text after all eight passes.
        """
        txt = self.sub_punctuation_between_single_quotes_armenian(txt)
        txt = self.sub_punctuation_between_single_quote_slanted_armenian(txt)
        txt = self.sub_punctuation_between_double_quotes_armenian(txt)
        txt = self.sub_punctuation_between_square_brackets_armenian(txt)
        txt = self.sub_punctuation_between_parens_armenian(txt)
        txt = self.sub_punctuation_between_quotes_arrow_armenian(txt)
        txt = self.sub_punctuation_between_em_dashes_armenian(txt)
        txt = self.sub_punctuation_between_quotes_slanted_armenian(txt)
        return txt

    def sub_punctuation_between_single_quotes_armenian(self, txt):
        """Substitute punctuation inside straight single-quoted spans.

        ``txt`` is returned unchanged when it contains a word with a leading
        apostrophe and no apostrophe followed by whitespace anywhere.

        Args:
            txt: Text to process.

        Returns:
            ``txt`` with ``replace_punctuation(match, match_type='single')``
            applied to every BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX match, or
            ``txt`` itself when the guard above applies.
        """
        if re.search(self.WORD_WITH_LEADING_APOSTROPHE_ARMENIAN, txt) and \
                (not re.search(r"'\s", txt)):
            return txt
        return re.sub(self.BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX,
                      partial(replace_punctuation, match_type='single'), txt)

    def sub_punctuation_between_single_quote_slanted_armenian(self, txt):
        """Apply ``replace_punctuation`` to slanted single-quoted spans."""
        return re.sub(self.BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX,
                      replace_punctuation, txt)

    def sub_punctuation_between_parens_armenian(self, txt):
        """Apply ``replace_punctuation`` to parenthesised spans."""
        return re.sub(self.BETWEEN_PARENS_ARMENIAN_REGEX_2,
                      replace_punctuation, txt)

    def sub_punctuation_between_square_brackets_armenian(self, txt):
        """Apply ``replace_punctuation`` to square-bracketed spans."""
        return re.sub(self.BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2,
                      replace_punctuation, txt)

    def sub_punctuation_between_double_quotes_armenian(self, txt):
        """Apply ``replace_punctuation`` to straight double-quoted spans."""
        return re.sub(self.BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2,
                      replace_punctuation, txt)

    def sub_punctuation_between_quotes_arrow_armenian(self, txt):
        """Apply ``replace_punctuation`` to «guillemet»-quoted spans."""
        return re.sub(self.BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2,
                      replace_punctuation, txt)

    def sub_punctuation_between_em_dashes_armenian(self, txt):
        """Apply ``replace_punctuation`` to spans between ``--`` pairs."""
        return re.sub(self.BETWEEN_EM_DASHES_REGEX_2_ARMENIAN,
                      replace_punctuation, txt)

    def sub_punctuation_between_quotes_slanted_armenian(self, txt):
        """Apply ``replace_punctuation`` to slanted double-quoted spans."""
        return re.sub(self.BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2,
                      replace_punctuation, txt)