File size: 1,147 Bytes
06bd223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_top_keywords(text, top_n=5):
    """
    Extract up to top_n keywords from 'text' using a simple TF-IDF approach.

    The text is lowercased and every character that is neither a word
    character nor whitespace is replaced by a space; English stop words are
    then excluded by the vectorizer. Only this single document is fitted, so
    all terms share the same IDF weight and the ranking is effectively by
    term frequency within 'text'.

    Args:
        text: The input string to analyse. None, empty, whitespace-only or
            punctuation-only input yields an empty list.
        top_n: Maximum number of keywords to return. None returns every
            keyword found; zero or a negative value yields an empty list.

    Returns:
        A list of keyword strings ordered by descending score. Terms with
        equal scores keep the order reported by the vectorizer. The list is
        empty when no usable token remains (for example, the text contains
        only stop words).
    """
    # Cheap guards first: nothing to rank, so skip the vectorizer entirely.
    if not text or (top_n is not None and top_n <= 0):
        return []

    # Replace punctuation with spaces so it never glues tokens together.
    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
    if not cleaned_text.strip():
        return []

    # Initialize TF-IDF with English stop words; cap the vocabulary size.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)

    try:
        # TF-IDF expects an iterable of documents, so wrap text in a list.
        tfidf_matrix = vectorizer.fit_transform([cleaned_text])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # the text holds only stop words or single-character tokens.
        return []

    # There is exactly one document, so its scores live in row 0.
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]

    # sorted() is stable, so equal scores keep the vectorizer's term order.
    ranked = sorted(
        zip(feature_names, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [word for word, _ in ranked[:top_n]]