Spaces:
Runtime error
Runtime error
import nltk | |
import spacy | |
from fastcoref import spacy_component | |
class TextPreprocessor:
    """
    Prepare free text for the downstream pipeline.

    Calling an instance resolves coreferences in the input text, splits the
    resolved text into sentences, and divides every sentence into a positive
    and a negative prompt.

    NOTE(review): the original summary also mentioned generating CLIP
    embeddings; no such step is implemented in this class.
    """

    def __init__(self):
        """Fetch the required NLTK / spaCy resources and build the pipeline."""
        # Both downloads are requested on every construction, so creating an
        # instance may need network access.
        nltk.download('punkt')
        spacy.cli.download("en_core_web_sm")
        self.nlp = spacy.load("en_core_web_sm")
        # The "fastcoref" factory name comes from the
        # `from fastcoref import spacy_component` import at the top of the file.
        self.nlp.add_pipe(
            "fastcoref",
            config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
        )

    def coref(self, text=None):
        """
        Perform coreference resolution on a paragraph.

        Parameters:
            text: the input paragraph whose coreferences are to be resolved.
                When falsy (``None`` or an empty string) the sample paragraph
                'Alice goes down the rabbit hole. Where she would discover a
                new reality beyond her expectations.' is used instead.

        Returns:
            The ``resolved_text`` extension attribute of the processed
            document, i.e. the coreference-resolved paragraph.
        """
        if not text:
            text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'
        doc = self.nlp(
            text,
            component_cfg={"fastcoref": {'resolve_text': True}}
        )
        # Check doc._.coref_clusters for cluster info
        return doc._.resolved_text

    @staticmethod
    def _split_tokens(words: list) -> dict:
        """Split an already tokenized sentence at its first negation token."""
        negations = ("n't", "not")
        # Both prompts start with a single space and every token appended
        # below brings its own leading space, exactly as before.
        positive = " "
        negative = " "
        for index, word in enumerate(words[:-1]):
            if words[index + 1].lower() in negations:
                # The token right before the negation (e.g. the auxiliary
                # "does" in "does not") and the negation itself are dropped;
                # everything after the negation forms the negative prompt.
                negative += "".join(" " + token for token in words[index + 2:])
                return {'pos': positive, 'neg': negative}
            positive += " " + word
        if words:
            last = words[-1]
            if any(char.isalnum() for char in last):
                # A real word needs a separator, otherwise it would be fused
                # with the previous token ("big" + "cat" -> "bigcat").
                positive += " " + last
            else:
                # Punctuation-only tokens stay attached to the preceding word.
                positive += last
        return {'pos': positive, 'neg': negative}

    def neg_prompt(self, string: str):
        """
        Split text into a positive and a negative prompt.

        The text is tokenized with ``nltk.word_tokenize``. Tokens preceding
        the first negation ("not" / "n't", case-insensitive) make up the
        positive prompt; tokens following it make up the negative prompt.
        The token immediately before the negation and the negation itself
        are discarded. Without a negation the whole sentence is positive.
        A negation in the very first token position is not detected.

        Parameters:
            string: the sentence to split.

        Returns:
            A dict ``{'pos': <positive prompt>, 'neg': <negative prompt>}``.
            Both values begin with whitespace; an empty prompt is ``" "``.
        """
        return self._split_tokens(nltk.word_tokenize(string))

    def __call__(self, text):
        """
        Run the full preprocessing on ``text``.

        Parameters:
            text: the raw input paragraph.

        Returns:
            A tuple ``(processed_sentences, old_sentences)`` where
            ``processed_sentences`` holds one ``neg_prompt`` dict per sentence
            of the coreference-resolved text and ``old_sentences`` holds the
            sentences of the original, unresolved text.
        """
        old_sentences = nltk.sent_tokenize(text)
        coref_text = self.coref(text)
        processed_sentences = [
            self.neg_prompt(sentence)
            for sentence in nltk.sent_tokenize(coref_text)
        ]
        return processed_sentences, old_sentences