Videobook_v1.5

Runtime error

App Files Files Community

Videobook_v1.5 / Videobook /TextPreprocessor.py

Warlord-K

Fix Text Preprocessor

6cdb392 over 1 year ago

raw

history blame contribute delete

2.15 kB

	import nltk
	import spacy
	from fastcoref import spacy_component
	class TextPreprocessor:
	    """
	    Preprocess narrative text for the pipeline.

	    Resolves coreferences over the whole text, splits the result into
	    sentences, and splits every sentence into a positive and a negative prompt.
	    """

	    # Paragraph used by ``coref`` when it is called without any text.
	    DEFAULT_TEXT = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'

	    def __init__(self):
	        """
	        Make sure the tokenizer data and spaCy model exist, then build the pipeline.

	        Resources are looked up locally first and only downloaded when missing,
	        so constructing a second instance (or running offline with everything
	        already installed) does not trigger another download.
	        """
	        try:
	            nltk.data.find('tokenizers/punkt')
	        except LookupError:
	            nltk.download('punkt')

	        try:
	            self.nlp = spacy.load("en_core_web_sm")
	        except OSError:
	            # spaCy raises OSError when the model package is not installed.
	            spacy.cli.download("en_core_web_sm")
	            self.nlp = spacy.load("en_core_web_sm")

	        self.nlp.add_pipe(
	            "fastcoref",
	            config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
	        )

	    def coref(self, text=None):
	        """
	        Do coreference resolution on a paragraph.

	        Parameters:
	            text: the input paragraph whose coreferences are to be resolved.
	                When empty or None, ``DEFAULT_TEXT`` is used instead.

	        Returns:
	            The coreference-resolved paragraph.
	        """
	        if not text:
	            text = self.DEFAULT_TEXT
	        doc = self.nlp(
	            text,
	            component_cfg={"fastcoref": {'resolve_text': True}}
	        )
	        # Check doc._.coref_clusters for cluster info
	        return doc._.resolved_text

	    @staticmethod
	    def _split_tokens(words):
	        """
	        Split a token list at its first negation into prompt strings.

	        The token directly before the negation (e.g. the "do" of "do n't") and
	        the negation itself are dropped; tokens before them form the positive
	        prompt and tokens after them form the negative prompt. Both strings
	        start with a single space and every joined token is preceded by one.

	        NOTE(review): a negation in the very first position is not detected
	        (the scan starts at the second token) - confirm whether that is intended.

	        Parameters:
	            words: list of string tokens.

	        Returns:
	            dict with the keys 'pos' and 'neg'.
	        """
	        negations = ("n't", "not")
	        for idx in range(1, len(words)):
	            if words[idx].lower() in negations:
	                kept = words[:idx - 1]
	                rejected = words[idx + 1:]
	                return {
	                    'pos': " " + "".join(" " + tok for tok in kept),
	                    'neg': " " + "".join(" " + tok for tok in rejected),
	                }

	        positive = " " + "".join(" " + tok for tok in words[:-1])
	        if words:
	            last = words[-1]
	            # Keep closing punctuation attached to the previous word, but never
	            # glue a real word onto it ("I like cats" must not become "likecats").
	            if any(ch.isalnum() for ch in last):
	                positive += " "
	            positive += last
	        return {'pos': positive, 'neg': " "}

	    def neg_prompt(self, string: str):
	        """
	        Split text into a positive and a negative prompt.

	        Parameters:
	            string: the sentence to split.

	        Returns:
	            dict with the keys 'pos' (text before the first negation) and
	            'neg' (text after the first negation, or " " when there is none).
	        """
	        return self._split_tokens(nltk.word_tokenize(string))

	    def __call__(self, text):
	        """
	        Run the full preprocessing on ``text``.

	        Parameters:
	            text: the raw input text.

	        Returns:
	            A tuple ``(processed_sentences, old_sentences)``: the list of
	            'pos'/'neg' dicts for every sentence of the coreference-resolved
	            text, and the sentences of the original text. For empty input the
	            first list is built from ``DEFAULT_TEXT`` (see ``coref``) while the
	            second one is empty.
	        """
	        old_sentences = nltk.sent_tokenize(text)
	        resolved_text = self.coref(text)
	        processed_sentences = [
	            self.neg_prompt(sentence)
	            for sentence in nltk.sent_tokenize(resolved_text)
	        ]
	        return processed_sentences, old_sentences