Spaces:

GIZ
/

GIZ-Project-Search

Running on CPU Upgrade

GIZ-Project-Search / appStore /tfidf_extraction.py

Create tfidf_extraction.py

06bd223 verified 6 months ago

1.15 kB

	import re
	from sklearn.feature_extraction.text import TfidfVectorizer

	def extract_top_keywords(text, top_n=5):
	    """
	    Extract the top_n keywords from 'text' using a simple TF-IDF approach.

	    Only a single document is fitted, so every term gets the same IDF weight
	    and the ranking effectively reflects (normalized) term frequency after
	    English stop words have been removed.

	    Args:
	        text: The document to analyse. Anything that is not a string
	            (e.g. None or a NaN coming from a missing field) is treated as
	            "no text".
	        top_n: Maximum number of keywords to return. Zero or a negative
	            value yields an empty list; None returns every keyword found.

	    Returns:
	        A list of keyword strings ordered by descending score; terms with
	        equal scores keep the vectorizer's feature order. The list is empty
	        when the text has no usable tokens (empty, punctuation only, or
	        stop words only).
	    """
	    # Guard against missing / non-text input instead of failing on .lower().
	    if not isinstance(text, str):
	        return []

	    # A negative top_n would otherwise slice "all but the last n" keywords.
	    if top_n is not None and top_n <= 0:
	        return []

	    # Replace punctuation with spaces so TF-IDF doesn't see it as part of tokens.
	    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
	    if not cleaned_text.strip():
	        return []

	    # Initialize TF-IDF with English stop words.
	    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)

	    try:
	        # TF-IDF expects an iterable of documents, so wrap the text in a list.
	        tfidf_matrix = vectorizer.fit_transform([cleaned_text])
	    except ValueError:
	        # scikit-learn raises ValueError for an empty vocabulary, e.g. when the
	        # text contains only stop words or single-character tokens.
	        return []

	    # There is only one document, so its scores are in row 0.
	    feature_names = vectorizer.get_feature_names_out()
	    scores = tfidf_matrix.toarray()[0]

	    # Stable sort by score, highest first.
	    ranked = sorted(
	        zip(feature_names, scores),
	        key=lambda pair: pair[1],
	        reverse=True,
	    )

	    # Return just the words, as plain Python strings.
	    return [str(word) for word, _ in ranked[:top_n]]