# File size: 2,211 Bytes
# e2b1d98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
### Imports
import stanza
import pandas as pd
import traceback

class StanzaSegmentizer:
    """Sentence segmenter built on a stanza pipeline.

    Creating an instance builds the pipeline with ``stanza.Pipeline('en')``.
    If that fails, the traceback is printed and the pipeline is left as
    ``None``; a later call to ``segment_into_sentences`` with text to process
    will then raise ``TypeError`` because ``None`` is not callable.
    """
    ##==========================================================================================================
    # Pipeline handle. The class-level value is only the default: the
    # assignments in initialize_stanza() go through ``self`` and therefore
    # create an instance attribute, so every new instance starts from None.
    __nlp_stanza = None
    ##==========================================================================================================
    def __init__(self):
        """Build the stanza pipeline when the instance does not have one yet.

        Any exception escaping the initialization is printed, not re-raised.
        """
        try:
            if self.__nlp_stanza is None:
                print("Initializing stanza")
                self.initialize_stanza()
        except Exception as excMsg:
            print(excMsg)
    ##==========================================================================================================
    def initialize_stanza(self):
        """Create the pipeline via ``stanza.Pipeline('en')`` and store it.

        Returns:
            The pipeline object, or ``None`` when construction raised. In the
            failure case the full traceback is printed and the stored
            pipeline is reset to ``None``.
        """
        try:
            self.__nlp_stanza = stanza.Pipeline('en')
        except Exception:
            # Best-effort: report the failure and leave the pipeline unset.
            print(f"An error happens in initialize_stanza(...) {traceback.format_exc()}.")
            self.__nlp_stanza = None
        return self.__nlp_stanza
    ##==========================================================================================================
    def segment_into_sentences(self, src_text="", _format="str"):
        """Split text into sentences using the stored pipeline.

        Args:
            src_text: Either a single string, or a list whose items are each
                passed to the pipeline; the resulting sentences are
                concatenated in order.
            _format: ``"str"`` (default) returns the ``.text`` of every
                sentence; any other value returns the sentence objects
                produced by the pipeline.

        Returns:
            A list of strings when ``_format == "str"``, otherwise a list of
            sentence objects. When ``src_text`` is neither a ``str`` nor a
            ``list`` (for example ``None``), ``"str"`` mode returns an empty
            list and the raw mode returns ``None``.
        """
        if isinstance(src_text, str):
            sentences = list(self.__nlp_stanza(src_text).sentences)
        elif isinstance(src_text, list):
            sentences = []
            for chunk in src_text:
                sentences.extend(self.__nlp_stanza(chunk).sentences)
        else:
            # Unsupported input type: nothing was segmented.
            sentences = None

        if _format != "str":
            return sentences
        if sentences is None:
            # Previously this path iterated over None and crashed with an
            # opaque "'NoneType' object is not iterable".
            return []
        return [sentence.text for sentence in sentences]
    ##==========================================================================================================

##==========================================================================================================