import re

from sklearn.feature_extraction.text import TfidfVectorizer


def extract_top_keywords(text, top_n=5):
    """
    Extract top_n keywords from 'text' using a simple TF-IDF approach.
    Returns a list of strings (keywords).
    """
    # (Optional) remove punctuation etc. so that TF-IDF doesn't see them as separate tokens
    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())

    # Initialize TF-IDF with English stop words
    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)

    # TF-IDF expects an iterable of documents, so wrap the text in a list
    tfidf_matrix = vectorizer.fit_transform([cleaned_text])

    # Extract the feature names and the row (since there's only 1 doc, row=0)
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]

    # Pair up (feature_name, score)
    word_score_pairs = list(zip(feature_names, scores))

    # Sort by score descending
    word_score_pairs.sort(key=lambda x: x[1], reverse=True)

    # Return just the top_n words
    top_keywords = [w for (w, s) in word_score_pairs[:top_n]]
    return top_keywords
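

# --- Usage sketch (the sample text below is invented for illustration) ---
# Note: with only one document in the corpus, every term's IDF is the same,
# so the ranking effectively reflects term frequency within the text.
if __name__ == "__main__":
    sample_text = (
        "Keyword extraction helps summarize documents quickly. "
        "A TF-IDF keyword extraction baseline is simple to set up and "
        "often good enough for quick document summaries."
    )
    print(extract_top_keywords(sample_text, top_n=5))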