# TopicModelingForSummarization / src / spacy_utilities.py
# alpertml's picture
# Upload 88 files
# e2b1d98
### Imports
import spacy
from spacy.lang.en import English
from spacy import displacy
import pandas as pd
import traceback
class SpacySegmentizer:
    """Split text into sentences with a spaCy ``English`` pipeline.

    The pipeline is a blank ``English()`` object to which only the
    ``"sentencizer"`` component is added; it is built when an instance is
    created.
    """

    ##==========================================================================================================
    # Definition of attributes
    #
    # spaCy pipeline used for segmentation. It is None until
    # initialize_spacy() succeeds, and is reset to None when that call fails.
    __nlp_SpaCy = None

    ##==========================================================================================================
    def __init__(self):
        """Build the spaCy pipeline if none is set yet."""
        if self.__nlp_SpaCy is None:
            print("Initializing spacy")
            self.initialize_spacy()

    ##==========================================================================================================
    def initialize_spacy(self):
        """Create the sentence-splitting pipeline and store it on the instance.

        Failures are reported on stdout (with the traceback) instead of being
        raised, so construction of the object never fails; the stored
        pipeline is None in that case.

        Returns:
            The pipeline object, or None if it could not be created.
        """
        # NOTE: alternatives considered by the original author were a trained
        # pipeline (spacy.load("en_core_web_sm")) and a sentencizer with
        # custom punctuation (config={"punct_chars": [".", ";"]}).
        try:
            nlp = English()
            nlp.add_pipe("sentencizer")
        except Exception:
            # Deliberately best-effort: report and continue without a pipeline.
            print(f"An error happens in initialize_spacy(...) {traceback.format_exc()}.")
            nlp = None
        self.__nlp_SpaCy = nlp
        return self.__nlp_SpaCy

    ##==========================================================================================================
    def segment_into_sentences(self, src_text="", _format=""):
        """Segment one text, or a list of texts, into sentences.

        Args:
            src_text: A string, or a list of strings. For a list, the
                sentences of all items are returned in one flat list, in
                input order.
            _format: ``"str"`` to get each sentence as a plain string built
                by joining ``str()`` of its elements with single spaces (so
                the original spacing is not reproduced); any other value
                returns the sentence objects yielded by the pipeline's
                ``.sents`` unchanged.

        Returns:
            A list of sentences, or None when ``src_text`` is neither a
            string nor a list.

        Raises:
            RuntimeError: If the pipeline is not available because
                initialize_spacy() failed.
        """
        if isinstance(src_text, str):
            texts = [src_text]
        elif isinstance(src_text, list):
            texts = src_text
        else:
            # Unsupported input type. Return None for every _format value;
            # previously _format="str" crashed while iterating None.
            return None

        nlp = self.__nlp_SpaCy
        if nlp is None:
            raise RuntimeError(
                "spaCy pipeline is not initialized; initialize_spacy() failed."
            )

        sentences = []
        for text in texts:
            sentences.extend(nlp(text).sents)

        if _format == "str":
            return [" ".join(str(token) for token in sent) for sent in sentences]
        return sentences

    ##==========================================================================================================