```python
import re
import string

import nltk
import spacy
from nltk.corpus import stopwords

# Download the NLTK stop word list (a no-op if it is already present)
nltk.download('stopwords')

# Get the list of English stop words from NLTK
nltk_stop_words = stopwords.words('english')

# Load the small English spaCy model
nlp = spacy.load("en_core_web_sm")

def process_text(text):
    """
    Process text by:
    1. Lowercasing
    2. Removing stop words and punctuation
    3. Lemmatizing the remaining tokens
    4. Stripping non-alphanumeric characters from each lemma
    """
    # Step 1: Lowercase the text and tokenize it with spaCy
    doc = nlp(text.lower())

    # Step 2: Drop stop words and punctuation tokens, lemmatize what
    # remains, and strip non-alphanumeric characters from each lemma
    processed_tokens = [
        re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        for token in doc
        if token.text not in nltk_stop_words and token.text not in string.punctuation
    ]

    # Drop any empty strings left over from the regex substitution and
    # return the cleaned tokens as a single space-separated string
    return " ".join(word for word in processed_tokens if word)
```
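
A quick usage sketch follows; the sample sentence is illustrative, and the exact lemmas printed may vary with the version of the `en_core_web_sm` model:

```python
# Minimal usage sketch: the sample sentence and the printed output are
# illustrative, since exact lemmas depend on the spaCy model version.
sample = "The cats are running quickly through the gardens."
print(process_text(sample))
# Indicative output: cat run quickly garden
```

Note that stop words are filtered using NLTK's list while lemmatization comes from spaCy, so a word kept by one tool's conventions is still subject to the other's.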