Spaces:

GIZ
/

GIZ-Project-Search

Running on CPU Upgrade

annikwag committed on Jan 7

Commit

06bd223

verified ·

1 Parent(s): 3a0d69c

Create tfidf_extraction.py

Files changed (1) hide show

appStore/tfidf_extraction.py ADDED Viewed

+import re
+from sklearn.feature_extraction.text import TfidfVectorizer
def extract_top_keywords(text, top_n=5):
    """
    Extract up to ``top_n`` keywords from ``text`` using TF-IDF weights.

    The text is lower-cased, punctuation is replaced by spaces, and English
    stop words are dropped by the vectorizer. Because the vectorizer is fitted
    on this single document only, the IDF factor is identical for every term,
    so the ranking is effectively by normalized term frequency within ``text``.

    Args:
        text: Input string. ``None`` or an empty / whitespace-only string
            yields an empty list.
        top_n: Maximum number of keywords to return. ``None`` returns every
            extracted term; values <= 0 yield an empty list.

    Returns:
        A list of at most ``top_n`` keyword strings, highest score first.
        Terms with equal scores keep the order in which the vectorizer
        reports its features. Returns ``[]`` when no usable term remains
        (for example, the text consists only of stop words or punctuation).
    """
    # Nothing to rank: missing/empty text, or a non-positive keyword budget.
    # (A negative top_n would otherwise slice "all but the last k" items.)
    if not text or (top_n is not None and top_n <= 0):
        return []

    # Replace punctuation with spaces so it never ends up glued to a token.
    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
    if not cleaned_text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
    try:
        # The vectorizer expects an iterable of documents; we have exactly one.
        tfidf_matrix = vectorizer.fit_transform([cleaned_text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when every token
        # was filtered out, e.g. the text contains only stop words.
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]  # single document -> row 0

    # sorted() is stable, so equally scored terms keep their feature order.
    ranked = sorted(
        zip(feature_names, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [word for word, _ in ranked[:top_n]]