Spaces:
Sleeping
Sleeping
File size: 2,753 Bytes
42bcb30 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# -*- coding: utf-8 -*-
from pysbd.utils import Rule
class CleanRules(object):
    """Regex patterns and ``Rule`` objects used for text clean-up.

    Each ``Rule`` is constructed from a regular-expression pattern and the
    replacement string paired with it.  How a ``Rule`` is applied is decided
    by ``pysbd.utils.Rule``, which is defined outside this file.
    """

    # NOTE: patterns written as raw strings (r'...') must NOT double their
    # backslashes; only non-raw string literals need the extra ``\\``.

    # Newline followed by a line that holds only one or two ASCII letters.
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')

    # Two newlines separated by a single space -> one carriage return.
    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")

    # Two consecutive newlines -> one carriage return.
    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule(r'\n\n', "\r")

    # Newline directly before a period that is itself followed by whitespace.
    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')

    # Any newline -> carriage return.
    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")

    # The two-character text "backslash + n" (or "+ r") -> the real control
    # character.
    EscapedNewLineRule = Rule(r'\\n', "\n")
    EscapedCarriageReturnRule = Rule(r'\\r', "\r")

    # Same as above, but with a space between the backslash and the letter.
    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")
    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")

    # Inline formatting markers of the form {b^>123<b^}, in both their
    # entity-escaped (&gt; / &lt;) and literal (> / <) spellings.  The two
    # alternatives used to be byte-identical, which made one of them dead.
    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule(r'{b\^&gt;\d*&lt;b\^}|{b\^>\d*<b\^}', '')

    # Four or more periods followed by a number or a "12-34" style range.
    # Rubular: http://rubular.com/r/8mc1ArOIGy
    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")

    # Five or more consecutive periods -> a single space.
    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')

    # Exactly three forward slashes in a row.
    # Rubular: http://rubular.com/r/IQ4TPfsbd8
    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')

    # Period squeezed between a lowercase and an uppercase letter.  The
    # neighbours sit in lookarounds so that only the period itself is part of
    # the match and gets replaced by '. '.
    # Rubular: http://rubular.com/r/6dt98uI76u
    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')

    # Same idea, for a period between a digit and an uppercase letter.
    # Rubular: http://rubular.com/r/l6KN6rH5XE
    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
    NoSpaceBetweenSentencesDigitRule = Rule(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')

    # Substrings listed as URL / e-mail keywords; nothing in this class reads
    # the list, so its consumer lives elsewhere.
    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']

    # Newline preceded by whitespace and followed by a lowercase letter or an
    # opening parenthesis.
    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'

    # Newline directly before a bullet character -> carriage return.  A stray
    # apostrophe inside the lookahead used to restrict this to the sequence
    # "•'" and so the rule never fired for a plain bullet.
    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    NewLineFollowedByBulletRule = Rule(r"\n(?=•)", "\r")

    # Two apostrophes / two backticks -> a double-quote character.
    QuotationsFirstRule = Rule(r"''", '"')
    QuotationsSecondRule = Rule(r'``', '"')
class HTML(object):
    """``Rule`` objects whose patterns match HTML tags; each replaces with ''."""

    # Literal opening, closing or self-closing tag with optional attributes.
    # NOTE(review): in the unquoted-value class ``[\^'\">\s]`` the caret is
    # escaped, so it is a literal member rather than a negation -- possibly
    # ``[^'\">\s]`` was intended.  Left unchanged; confirm before touching.
    # Rubular: http://rubular.com/r/9d0OVOEJWj
    HTMLTagRule = Rule(r"<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[\^'\">\s]+))?)+\s*|\s*)\/?>", '')

    # Entity-escaped tag such as "&lt;div&gt;".  The pattern must open with
    # the escaped "&lt;": with a literal "<" it could never match a fully
    # escaped tag, and it would swallow ordinary text lying between a bare
    # "<" and a later "&gt;".
    # NOTE(review): ``[^gt;]*`` excludes the letters g and t, so escaped tags
    # whose name contains either letter are not matched -- kept as-is.
    # Rubular: http://rubular.com/r/XZVqMPJhea
    EscapedHTMLTagRule = Rule(r'&lt;\/?[^gt;]*gt;', '')

    # Both rules, literal-tag rule first.
    All = [HTMLTagRule, EscapedHTMLTagRule]
class PDF(object):
    """Line-break clean-up rules grouped under the ``PDF`` name."""

    # A newline that follows "<non-newline character><whitespace>" and is
    # followed by a non-whitespace character is replaced by the empty string.
    # Rubular: http://rubular.com/r/UZAVcwqck8
    NewLineInMiddleOfSentenceRule = Rule(
        r"(?<=[^\n]\s)\n(?=\S)",
        "",
    )

    # A newline directly before a lowercase ASCII letter is replaced by a
    # single space.
    # Rubular: http://rubular.com/r/eaNwGavmdo
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(
        r'\n(?=[a-z])',
        " ",
    )
|