File size: 2,365 Bytes
e2b1d98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
### Imports
import spacy
from spacy.lang.en import English
from spacy import displacy

import pandas as pd

import traceback

class SpacySegmentizer:
    """Split text into sentences with a blank English spaCy pipeline.

    The pipeline is created by ``initialize_spacy`` from ``English()`` with
    the ``sentencizer`` component added to it.
    """
    ##==========================================================================================================
    # The spaCy pipeline. Remains None until initialize_spacy() succeeds.
    __nlp_SpaCy = None
    ##==========================================================================================================
    def __init__(self):
        """Build the pipeline if none is available yet."""
        if self.__nlp_SpaCy is None:
            print("Initializing spacy")
            self.initialize_spacy()
    ##==========================================================================================================
    def initialize_spacy(self):
        """Create the sentence-splitting pipeline and store it on the instance.

        Any exception raised while building the pipeline is caught; its
        traceback is printed and the stored pipeline is reset to None.

        Returns:
            The pipeline object, or None when construction failed.
        """
        try:
            nlp = English()
            # "sentencizer" is added with its default settings; a config such
            # as {"punct_chars": [".", ";"]} could be passed here to change
            # the characters that end a sentence.
            nlp.add_pipe("sentencizer")
        except Exception:
            print(f"An error happens in initialize_spacy(...) {traceback.format_exc()}.")
            nlp = None
        self.__nlp_SpaCy = nlp
        return nlp
    ##==========================================================================================================
    def segment_into_sentences(self, src_text="", _format=""):
        """Segment one text, or a sequence of texts, into sentences.

        Args:
            src_text: A string, or a list/tuple of strings. Each string is
                run through the pipeline separately and the resulting
                sentences are concatenated in input order.
            _format: ``"str"`` to get each sentence as a plain string built
                by joining the sentence's tokens with single spaces (so the
                result is not necessarily identical to the source text).
                Any other value returns the sentence objects produced by
                the pipeline unchanged.

        Returns:
            A list of sentences in the requested format, or None when
            ``src_text`` is neither a string nor a list/tuple.

        Note:
            If pipeline initialization failed, the stored pipeline is None
            and calling this method with a string or list raises TypeError.
        """
        if isinstance(src_text, str):
            texts = [src_text]
        elif isinstance(src_text, (list, tuple)):
            texts = src_text
        else:
            # Unsupported input: answer None in every format instead of
            # failing later while iterating over a missing result.
            return None

        nlp = self.__nlp_SpaCy
        sentences = []
        for text in texts:
            sentences.extend(nlp(text).sents)

        if _format == "str":
            return [" ".join(str(token) for token in sentence) for sentence in sentences]
        return sentences
    ##==========================================================================================================

##==========================================================================================================