annikwag committed on
Commit 3a726ab · verified · 1 Parent(s): 082c0d4

Update appStore/tfidf_extraction.py

Files changed (1):
  1. appStore/tfidf_extraction.py  +11 -25
appStore/tfidf_extraction.py CHANGED
@@ -4,47 +4,33 @@ from stopwordsiso import stopwords, has_lang
 from sklearn.feature_extraction.text import TfidfVectorizer
 
 def detect_language(text: str) -> str:
-    """Detect language using langdetect; returns a language code (e.g. 'en', 'de', 'es')."""
     try:
         return langdetect.detect(text)
     except:
-        # If detection fails or is uncertain, default to English
-        return 'en'
+        return "en"  # fallback if detection fails
 
 def get_stopwords_for_language(lang_code: str):
-    """
-    Retrieve stopwords from stopwordsiso for a given language code.
-    If not available, fallback to an empty set.
-    """
     lang_code = lang_code.lower()
     if has_lang(lang_code):
-        # has_lang(lang_code) checks if stopwordsiso supports that code
-        return stopwords(lang_code)  # returns a set of stopwords
+        return stopwords(lang_code)  # returns a *set* of stopwords
     else:
-        return set()  # fallback if the language is unsupported
+        return set()
 
 def extract_top_keywords(text: str, top_n: int = 5) -> list[str]:
-    """
-    Extract top_n keywords from 'text' using TF-IDF,
-    language detection, and language-specific stopwords.
-    """
-    # Basic cleanup: remove punctuation, lower the case, etc.
     cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
 
-    # Detect language and get appropriate stopwords
     lang_code = detect_language(cleaned_text)
     language_stopwords = get_stopwords_for_language(lang_code)
 
-    # Build TF-IDF vectorizer with custom stopwords
-    vectorizer = TfidfVectorizer(stop_words=language_stopwords)
-    tfidf_matrix = vectorizer.fit_transform([cleaned_text])
-
+    # Convert the set to a list here!
+    vectorizer = TfidfVectorizer(stop_words=list(language_stopwords))
+
+    tfidf_matrix = vectorizer.fit_transform([cleaned_text])
     feature_names = vectorizer.get_feature_names_out()
-    scores = tfidf_matrix.toarray()[0]  # only 1 row, since we have 1 doc
+    scores = tfidf_matrix.toarray()[0]
 
-    # Pair (word, score) and sort descending by score
+    # Pair up (word, score), then sort descending
     word_score_pairs = list(zip(feature_names, scores))
     word_score_pairs.sort(key=lambda x: x[1], reverse=True)
 
-    # Return the top N words
-    return [word for (word, _) in word_score_pairs[:top_n]]
+    return [w for (w, _) in word_score_pairs[:top_n]]
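
The substantive change is the stop_words argument: newer scikit-learn releases validate TfidfVectorizer(stop_words=...) as the string 'english', a list, or None, so the plain set returned by stopwordsiso can be rejected at fit time, and wrapping it in list(...) avoids that. Below is a minimal usage sketch, assuming the file's first lines (outside this hunk) import re and langdetect, and that the module is importable as appStore.tfidf_extraction; both are assumptions, not shown in the diff.

# Hypothetical usage sketch; the import path is inferred from the file
# location (appStore/tfidf_extraction.py) and is not confirmed by the diff.
from appStore.tfidf_extraction import extract_top_keywords

sample = (
    "Renewable energy investments accelerate the transition away from "
    "fossil fuels and strengthen national climate adaptation plans."
)

# Returns up to top_n keyword strings. With a single document the IDF factor
# is constant, so the ranking effectively follows term frequency after
# stopword removal for the detected language.
print(extract_top_keywords(sample, top_n=5))

Because the sample is English, langdetect should return 'en' and the stopwordsiso English set is applied before scoring.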