annikwag committed on
Commit
06bd223
·
verified ·
1 Parent(s): 3a0d69c

Create tfidf_extraction.py

Browse files
Files changed (1) hide show
  1. appStore/tfidf_extraction.py +30 -0
appStore/tfidf_extraction.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+
4
def extract_top_keywords(text, top_n=5):
    """
    Extract the highest-scoring keywords from a single piece of text.

    The text is lower-cased, punctuation is replaced by spaces, English
    stop words are dropped, and the remaining terms are scored with
    ``TfidfVectorizer``. Because only one document is fitted, every term
    gets the same IDF weight, so the ranking is effectively by term
    frequency. Terms with equal scores keep the vectorizer's feature
    order.

    Args:
        text: The input string. ``None`` or an empty string yields ``[]``.
        top_n: Maximum number of keywords to return (default 5). Zero or
            a negative value yields ``[]``; ``None`` returns every term.

    Returns:
        A list of keyword strings, best first. The list is empty when the
        text contains no usable terms (blank, punctuation only, or stop
        words only).
    """
    # Nothing to analyse: avoid AttributeError on None and skip the fit.
    if not text:
        return []

    # A negative slice bound would silently drop items from the END of the
    # ranking rather than limit its length, so treat it as "no keywords".
    if top_n is not None and top_n <= 0:
        return []

    # Replace punctuation with spaces so it cannot glue onto or split tokens.
    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
    if not cleaned_text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)

    try:
        # The vectorizer expects an iterable of documents, hence the list.
        tfidf_matrix = vectorizer.fit_transform([cleaned_text])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # the text consists solely of stop words or one-character tokens.
        return []

    feature_names = vectorizer.get_feature_names_out()
    # Only one document was fitted, so row 0 holds all the scores.
    scores = tfidf_matrix.toarray()[0]

    # sorted() is stable even with reverse=True, so ties keep feature order.
    ranked = sorted(zip(feature_names, scores), key=lambda pair: pair[1], reverse=True)

    return [word for word, _score in ranked[:top_n]]