File size: 4,076 Bytes
42bcb30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# -*- coding: utf-8 -*-
import re

from pysbd.languages import Language
from pysbd.processor import Processor
from pysbd.cleaner import Cleaner
from pysbd.utils import TextSpan

class Segmenter(object):
    """Split text into sentences, optionally with character offsets.

    The heavy lifting is delegated to a ``Cleaner`` and a ``Processor``;
    a language module may provide its own ``Cleaner`` / ``Processor``
    attributes, otherwise the generic ones imported by this file are used.
    """

    def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
        """Segments a text into a list of sentences
        with or without character offsets from the original text

        Parameters
        ----------
        language : str, required
            specify a language using its two character ISO 639-1 code,
            by default "en"
        clean : bool, optional
            cleans original text, by default False
        doc_type : [type], optional
            Normal text or OCRed text, by default None
            set to `pdf` for OCRed text
        char_span : bool, optional
            Get start & end character offsets of each sentence
            within original text, by default False

        Raises
        ------
        ValueError
            if both `clean` and `char_span` are True, or if
            `doc_type` is 'pdf' while `clean` is False
        """
        self.language = language
        self.language_module = Language.get_language_code(language)
        self.clean = clean
        self.doc_type = doc_type
        self.char_span = char_span
        if self.clean and self.char_span:
            raise ValueError("char_span must be False if clean is True. "
                             "Since `clean=True` will modify original text.")
        # when doctype is pdf then force user to clean the text
        # char_span func wont be provided with pdf doctype also
        elif self.doc_type == 'pdf' and not self.clean:
            # NOTE(review): the last two literals concatenate to
            # "originaltext" (missing space). Left byte-identical in case
            # callers compare the message text - confirm before changing.
            raise ValueError("`doc_type='pdf'` should have `clean=True` & "
                            "`char_span` should be False since original"
                            "text will be modified.")

    def cleaner(self, text):
        """Return a cleaner object for `text`.

        Uses the language module's own ``Cleaner`` when it defines one,
        otherwise the generic ``Cleaner``.
        """
        if hasattr(self.language_module, "Cleaner"):
            return self.language_module.Cleaner(text, self.language_module,
                                                doc_type=self.doc_type)
        else:
            return Cleaner(text, self.language_module, doc_type=self.doc_type)

    def processor(self, text):
        """Return a processor object for `text`.

        Uses the language module's own ``Processor`` when it defines one,
        otherwise the generic ``Processor``.
        """
        if hasattr(self.language_module, "Processor"):
            return self.language_module.Processor(text, self.language_module,
                                                  char_span=self.char_span)
        else:
            return Processor(text, self.language_module,
                             char_span=self.char_span)

    def sentences_with_char_spans(self, sentences):
        """Locate each sentence inside ``self.original_text``.

        Parameters
        ----------
        sentences : iterable of str
            sentences in the order they occur in the original text

        Returns
        -------
        list
            one ``TextSpan(matched_text, start, end)`` per sentence that
            could be located; the matched text includes any whitespace
            that directly follows the sentence. A sentence with no
            suitable match in the original text is skipped.
        """
        # since SENTENCE_BOUNDARY_REGEX doesnt account
        # for trailing whitespaces \s* & is used as suffix
        # to keep non-destructive text after segments joins
        sent_spans = []
        prior_end_char_idx = 0
        for sent in sentences:
            # raw string: `\s` is an invalid escape in a normal literal
            pattern = r'{0}\s*'.format(re.escape(sent))
            for match in re.finditer(pattern, self.original_text):
                match_start_idx, match_end_idx = match.span()
                if match_end_idx > prior_end_char_idx:
                    # making sure the current sentence and its span
                    # is either the first sentence along with its char spans
                    # or the current span ends after the prior sentence span
                    # (guards against re-matching an earlier, identical
                    # sentence)
                    sent_spans.append(
                        TextSpan(match.group(), match_start_idx, match_end_idx))
                    prior_end_char_idx = match_end_idx
                    break
        return sent_spans

    def segment(self, text):
        """Segment `text` into sentences.

        Parameters
        ----------
        text : str
            text to segment; it is also stored on ``self.original_text``

        Returns
        -------
        list
            - empty list when `text` is falsy
            - list of ``TextSpan`` when ``char_span`` is True
            - list of cleaned sentences when ``clean`` is True
            - otherwise the sentences as found in the original text,
              trailing whitespace included
        """
        self.original_text = text
        if not text:
            return []

        if self.clean or self.doc_type == 'pdf':
            text = self.cleaner(text).clean()

        postprocessed_sents = self.processor(text).process()
        if self.clean and not self.char_span:
            # clean and destructed sentences; spans are not needed here,
            # so skip the per-sentence regex scan over the original text
            return postprocessed_sents

        sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
        if self.char_span:
            return sentence_w_char_spans
        # nondestructive with whitespaces
        return [textspan.sent for textspan in sentence_w_char_spans]