Spaces:
Runtime error
Runtime error
### Imports | |
import spacy | |
from spacy.lang.en import English | |
from spacy import displacy | |
import pandas as pd | |
import traceback | |
class SpacySegmentizer:
    """Split English text into sentences with spaCy's rule-based sentencizer.

    The pipeline is created once per instance by ``initialize_spacy()`` and
    reused by every call to ``segment_into_sentences()``.
    """

    ##==========================================================================================================
    # Pipeline slot. The class-level ``None`` is only the "not yet built"
    # default; ``initialize_spacy()`` stores the real pipeline on the instance.
    __nlp_SpaCy = None

    ##==========================================================================================================
    def __init__(self):
        """Build the sentence-splitting pipeline if none is set yet."""
        if self.__nlp_SpaCy is None:
            print("Initializing spacy")
            self.initialize_spacy()

    ##==========================================================================================================
    def initialize_spacy(self):
        """Create a blank English pipeline with a ``sentencizer`` component.

        Initialisation is best-effort: any exception is reported on stdout
        together with its traceback, and the pipeline slot is reset to
        ``None`` instead of propagating the error.

        Returns:
            The configured pipeline, or ``None`` when initialisation failed.
        """
        try:
            pipeline = English()
            pipeline.add_pipe("sentencizer")
        except Exception:
            # Deliberately broad: a failed setup must not abort construction.
            print(f"An error happens in initialize_spacy(...) {traceback.format_exc()}.")
            pipeline = None
        self.__nlp_SpaCy = pipeline
        return self.__nlp_SpaCy

    ##==========================================================================================================
    def segment_into_sentences(self, src_text="", _format=""):
        """Segment one text, or a list of texts, into sentences.

        Args:
            src_text: A single string, or a list whose items are each passed
                to the pipeline in order. The sentences of all items are
                returned as one flat list.
            _format: ``"str"`` to get plain strings; any other value returns
                the sentence objects produced by the pipeline.

        Returns:
            A list of sentences, or ``None`` when ``src_text`` is neither a
            ``str`` nor a ``list`` (for every ``_format``).

            With ``_format="str"`` each sentence is rebuilt by joining its
            tokens with single spaces, so the original spacing around
            punctuation is not preserved.

        Raises:
            TypeError: If text has to be processed while the pipeline is
                ``None`` because ``initialize_spacy()`` failed.
        """
        if isinstance(src_text, str):
            texts = [src_text]
        elif isinstance(src_text, list):
            texts = src_text
        else:
            # Unsupported input: report "no result" for every output format
            # instead of crashing later while iterating over ``None``.
            return None

        pipeline = self.__nlp_SpaCy
        sentences = []
        for text in texts:
            sentences.extend(pipeline(text).sents)

        if _format == "str":
            return [" ".join(str(token) for token in sentence) for sentence in sentences]
        return sentences
##========================================================================================================== | |
##========================================================================================================== |