annikwag committed on
Commit
f45fad6
·
verified ·
1 Parent(s): 3346614

Update appStore/tfidf_extraction.py

Browse files
Files changed (1) hide show
  1. appStore/tfidf_extraction.py +20 -12
appStore/tfidf_extraction.py CHANGED
@@ -3,33 +3,41 @@ import langdetect
3
  from stopwordsiso import stopwords, has_lang
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
 
6
- def detect_language(text: str) -> str:
7
  try:
8
  return langdetect.detect(text)
9
  except:
10
- return "en" # fallback if detection fails
11
 
12
- def get_stopwords_for_language(lang_code: str):
13
  lang_code = lang_code.lower()
14
  if has_lang(lang_code):
15
- return stopwords(lang_code) # returns a *set* of stopwords
16
- else:
17
- return set()
18
 
19
- def extract_top_keywords(text: str, top_n: int = 5) -> list[str]:
 
20
  cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
21
-
22
  lang_code = detect_language(cleaned_text)
23
  language_stopwords = get_stopwords_for_language(lang_code)
24
 
25
- # Convert the set to a list here!
26
- vectorizer = TfidfVectorizer(stop_words=list(language_stopwords))
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- tfidf_matrix = vectorizer.fit_transform([cleaned_text])
29
  feature_names = vectorizer.get_feature_names_out()
30
  scores = tfidf_matrix.toarray()[0]
31
 
32
- # Pair up (word, score), then sort descending
33
  word_score_pairs = list(zip(feature_names, scores))
34
  word_score_pairs.sort(key=lambda x: x[1], reverse=True)
35
 
 
3
  from stopwordsiso import stopwords, has_lang
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
 
6
+ def detect_language(text):
7
  try:
8
  return langdetect.detect(text)
9
  except:
10
+ return "en"
11
 
12
+ def get_stopwords_for_language(lang_code):
13
  lang_code = lang_code.lower()
14
  if has_lang(lang_code):
15
+ return stopwords(lang_code) # returns a set of stopwords
16
+ return set()
 
17
 
18
+ def extract_top_keywords(text, top_n=5):
19
+ # Basic cleanup
20
  cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
 
21
  lang_code = detect_language(cleaned_text)
22
  language_stopwords = get_stopwords_for_language(lang_code)
23
 
24
+ # Convert stopwords set to list because TfidfVectorizer needs list/None/'english'
25
+ stopwords_list = list(language_stopwords)
26
+
27
+ vectorizer = TfidfVectorizer(stop_words=stopwords_list)
28
+
29
+ try:
30
+ tfidf_matrix = vectorizer.fit_transform([cleaned_text])
31
+ except ValueError as e:
32
+ # If there's nothing left after removing stopwords/punctuation
33
+ if "empty vocabulary" in str(e).lower():
34
+ return [] # Return an empty list -> no keywords
35
+ else:
36
+ raise e # Something else went wrong
37
 
 
38
  feature_names = vectorizer.get_feature_names_out()
39
  scores = tfidf_matrix.toarray()[0]
40
 
 
41
  word_score_pairs = list(zip(feature_names, scores))
42
  word_score_pairs.sort(key=lambda x: x[1], reverse=True)
43