annikwag committed on
Commit 0059f4a · verified · 1 Parent(s): 139362e

Update appStore/tfidf_extraction.py

Files changed (1)
  1. appStore/tfidf_extraction.py +21 -20
appStore/tfidf_extraction.py CHANGED
@@ -1,49 +1,50 @@
 import re
 import langdetect
-from stopwordsiso import stopwords, stopwords_json
+from stopwordsiso import stopwords, has_lang
 from sklearn.feature_extraction.text import TfidfVectorizer

-def detect_language(text):
+def detect_language(text: str) -> str:
     """Detect language using langdetect; returns a language code (e.g. 'en', 'de', 'es')."""
     try:
         return langdetect.detect(text)
     except:
-        return 'en'  # fallback to English
+        # If detection fails or is uncertain, default to English
+        return 'en'

-def get_stopwords_for_language(lang_code):
+def get_stopwords_for_language(lang_code: str):
     """
     Retrieve stopwords from stopwordsiso for a given language code.
-    If not available, fallback to empty set.
+    If not available, fallback to an empty set.
     """
     lang_code = lang_code.lower()
-    # stopwords_json is a dict of { 'en': [...], 'de': [...], ...}
-    if lang_code in stopwords_json:
-        # call stopwords(lang_code) to retrieve that language's stopwords as a set
-        return stopwords(lang_code)
+    if has_lang(lang_code):
+        # has_lang(lang_code) checks if stopwordsiso supports that code
+        return stopwords(lang_code)  # returns a set of stopwords
     else:
-        return set()  # fallback to empty set
+        return set()  # fallback if the language is unsupported

-def extract_top_keywords(text, top_n=5):
+def extract_top_keywords(text: str, top_n: int = 5) -> list[str]:
     """
-    Extract top_n keywords from 'text' using a simple TF-IDF approach
-    with language detection and language-specific stopwords.
+    Extract top_n keywords from 'text' using TF-IDF,
+    language detection, and language-specific stopwords.
     """
+    # Basic cleanup: remove punctuation, lower the case, etc.
     cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
-    lang_code = detect_language(cleaned_text)

+    # Detect language and get appropriate stopwords
+    lang_code = detect_language(cleaned_text)
     language_stopwords = get_stopwords_for_language(lang_code)

-    # Pass your custom language_stopwords into TF-IDF:
+    # Build TF-IDF vectorizer with custom stopwords
     vectorizer = TfidfVectorizer(stop_words=language_stopwords)
     tfidf_matrix = vectorizer.fit_transform([cleaned_text])

     feature_names = vectorizer.get_feature_names_out()
-    scores = tfidf_matrix.toarray()[0]
+    scores = tfidf_matrix.toarray()[0]  # only 1 row, since we have 1 doc

-    # Pair (word, score), sort descending
+    # Pair (word, score) and sort descending by score
     word_score_pairs = list(zip(feature_names, scores))
     word_score_pairs.sort(key=lambda x: x[1], reverse=True)

-    # Return just the top_n words
-    top_keywords = [word for (word, score) in word_score_pairs[:top_n]]
-    return top_keywords
+    # Return the top N words
+    return [word for (word, _) in word_score_pairs[:top_n]]
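A minimal usage sketch of the updated extract_top_keywords, assuming langdetect, stopwordsiso, and scikit-learn are installed and that appStore is importable as a package; the sample sentence and the example output are illustrative assumptions, not part of this commit.

# Hypothetical usage sketch (not part of the commit).
from appStore.tfidf_extraction import extract_top_keywords

sample = (
    "Renewable energy investments in solar and wind power are growing, "
    "and solar power projects attract the largest share of new funding."
)

# Language is detected from the cleaned text, matching stopwords are applied,
# and up to top_n words are returned in descending TF-IDF order.
print(extract_top_keywords(sample, top_n=5))
# e.g. ['solar', 'power', 'energy', 'investments', 'wind']  (illustrative)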