Spaces:
Runtime error
Runtime error
| import nltk | |
| import spacy | |
| from fastcoref import spacy_component | |
class TextPreprocessor:
    """Prepare free text for the downstream prompt pipeline.

    Calling an instance resolves coreferences in the text, splits the
    resolved text into sentences and divides every sentence into a positive
    and a negative prompt. No embedding is computed in this class; any CLIP
    encoding has to happen in the caller.
    """

    # Tokens that NLTK's word tokenizer produces for a negation
    # ("don't" -> "do", "n't"). Compared case-insensitively.
    _NEGATIONS = frozenset({"n't", "not"})

    def __init__(self):
        """Fetch the required models and build the spaCy pipeline.

        Side effects: downloads NLTK tokenizer data and the
        ``en_core_web_sm`` spaCy model on every construction, then attaches
        the fastcoref component (LingMess model, CPU) to the pipeline.
        """
        nltk.download('punkt')
        # NLTK >= 3.8.2 loads the Punkt models from "punkt_tab" rather than
        # "punkt"; fetching both keeps the tokenizers usable on old and new
        # releases instead of failing with LookupError at first use.
        nltk.download('punkt_tab')
        spacy.cli.download("en_core_web_sm")
        self.nlp = spacy.load("en_core_web_sm")
        # The "fastcoref" factory is expected to be registered by the
        # file-level ``from fastcoref import spacy_component`` import.
        self.nlp.add_pipe(
            "fastcoref",
            config={
                'model_architecture': 'LingMessCoref',
                'model_path': 'biu-nlp/lingmess-coref',
                'device': 'cpu',
            },
        )

    def coref(self, text=None):
        """Return ``text`` with its coreferences resolved.

        Parameters:
            text: paragraph to resolve. When falsy (``None`` or an empty
                string) a built-in sample sentence about Alice is used.

        Returns:
            The resolved text as produced by the fastcoref component
            (``doc._.resolved_text``).
        """
        if not text:
            text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'
        doc = self.nlp(
            text,
            component_cfg={"fastcoref": {'resolve_text': True}},
        )
        # doc._.coref_clusters holds the cluster information if needed.
        return doc._.resolved_text

    def neg_prompt(self, string: str):
        """Split one sentence into a positive and a negative prompt.

        The sentence is tokenized with ``nltk.word_tokenize`` and cut at the
        first negation token (``not`` / ``n't``).

        Parameters:
            string: the sentence to split.

        Returns:
            ``{'pos': ..., 'neg': ...}`` where ``pos`` holds the tokens
            before the negation and ``neg`` the tokens after it. Both
            strings start with padding spaces and separate tokens with a
            single space.
        """
        return self._split_tokens(nltk.word_tokenize(string))

    @staticmethod
    def _split_tokens(words):
        """Partition a token list around its first negation token."""
        positive = " "
        negative = " "
        for i, word in enumerate(words[:-1]):
            if words[i + 1].lower() in TextPreprocessor._NEGATIONS:
                # The token right before the negation (typically the
                # auxiliary, e.g. "is" / "do") and the negation itself are
                # dropped; everything after it forms the negative prompt.
                negative += "".join(" " + tail for tail in words[i + 2:])
                return {'pos': positive, 'neg': negative}
            positive += " " + word
        if words:
            # Join the final token with a separator like every other token;
            # appending it bare fused the last two words of sentences that
            # lack terminal punctuation ("red car" -> "redcar").
            positive += " " + words[-1]
        return {'pos': positive, 'neg': negative}

    def __call__(self, text):
        """Run the full preprocessing on ``text``.

        Parameters:
            text: the raw input paragraph.

        Returns:
            A tuple ``(processed_sentences, old_sentences)``:
            ``processed_sentences`` is a list with one ``neg_prompt`` dict
            per sentence of the coreference-resolved text, and
            ``old_sentences`` is the sentence split of the raw input. The
            two lists come from separate tokenizer runs on different
            strings, so they are not aligned by construction.
        """
        old_sentences = nltk.sent_tokenize(text)
        resolved_text = self.coref(text)
        processed_sentences = [
            self.neg_prompt(sentence)
            for sentence in nltk.sent_tokenize(resolved_text)
        ]
        return processed_sentences, old_sentences