annikwag committed on
Commit
139362e
·
verified ·
1 Parent(s): 5c3a945

Update appStore/tfidf_extraction.py

Browse files
Files changed (1) hide show
  1. appStore/tfidf_extraction.py +13 -23
appStore/tfidf_extraction.py CHANGED
@@ -1,19 +1,14 @@
1
  import re
2
  import langdetect
3
- from stopwordsiso import stopwords
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
 
6
-
7
  def detect_language(text):
8
- """
9
- Detect language using langdetect; returns a language code (e.g. 'en', 'de', 'es').
10
- If detection fails or is uncertain, fallback to 'en'.
11
- """
12
  try:
13
  return langdetect.detect(text)
14
  except:
15
- return 'en' # fallback
16
-
17
 
18
  def get_stopwords_for_language(lang_code):
19
  """
@@ -21,36 +16,31 @@ def get_stopwords_for_language(lang_code):
21
  If not available, fallback to empty set.
22
  """
23
  lang_code = lang_code.lower()
24
- if lang_code in stopwords.langdict:
25
- return stopwords.lang(lang_code)
 
 
26
  else:
27
  return set() # fallback to empty set
28
 
29
-
30
  def extract_top_keywords(text, top_n=5):
31
  """
32
- Extract top_n keywords from 'text' using a simple TF-IDF approach with
33
- language detection and language-specific stopwords.
34
  """
35
- # Clean the text (remove punctuation, lower the case, etc.)
36
  cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
37
-
38
- # Detect language
39
  lang_code = detect_language(cleaned_text)
40
-
41
- # Get the relevant stopwords
42
  language_stopwords = get_stopwords_for_language(lang_code)
43
 
44
- # Initialize TF-IDF with the custom language stop words
45
  vectorizer = TfidfVectorizer(stop_words=language_stopwords)
46
-
47
- # We pass in a list of one "document" to TF-IDF
48
  tfidf_matrix = vectorizer.fit_transform([cleaned_text])
49
 
50
  feature_names = vectorizer.get_feature_names_out()
51
- scores = tfidf_matrix.toarray()[0] # row 0 since we only have one doc
52
 
53
- # Pair (word, score), then sort descending by score
54
  word_score_pairs = list(zip(feature_names, scores))
55
  word_score_pairs.sort(key=lambda x: x[1], reverse=True)
56
 
 
1
  import re
2
  import langdetect
3
+ from stopwordsiso import stopwords, stopwords_json
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
 
 
def detect_language(text):
    """Detect the language of ``text`` using langdetect.

    Returns the language code reported by ``langdetect.detect``
    (e.g. 'en', 'de', 'es'). If detection raises, 'en' is returned instead.
    """
    try:
        return langdetect.detect(text)
    except Exception:
        # Best-effort fallback: any ordinary detection failure maps to English.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate instead of silently turning them into 'en'.
        return 'en'  # fallback to English
 
12
 
def get_stopwords_for_language(lang_code):
    """
    Look up the stopwords for a language code.

    The code is lower-cased before the lookup. If it is not a key of
    stopwords_json, fallback to an empty set.
    """
    normalized_code = lang_code.lower()

    # NOTE(review): stopwords_json is imported from stopwordsiso at the top of
    # the file - confirm the installed version actually exports that name.
    if normalized_code not in stopwords_json:
        return set()  # fallback to empty set

    # Known language: let stopwords() retrieve that language's stopwords.
    return stopwords(normalized_code)
25
 
 
26
  def extract_top_keywords(text, top_n=5):
27
  """
28
+ Extract top_n keywords from 'text' using a simple TF-IDF approach
29
+ with language detection and language-specific stopwords.
30
  """
 
31
  cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
 
 
32
  lang_code = detect_language(cleaned_text)
33
+
 
34
  language_stopwords = get_stopwords_for_language(lang_code)
35
 
36
+ # Pass your custom language_stopwords into TF-IDF:
37
  vectorizer = TfidfVectorizer(stop_words=language_stopwords)
 
 
38
  tfidf_matrix = vectorizer.fit_transform([cleaned_text])
39
 
40
  feature_names = vectorizer.get_feature_names_out()
41
+ scores = tfidf_matrix.toarray()[0]
42
 
43
+ # Pair (word, score), sort descending
44
  word_score_pairs = list(zip(feature_names, scores))
45
  word_score_pairs.sort(key=lambda x: x[1], reverse=True)
46