annikwag committed on
Commit
daaad57
·
verified ·
1 Parent(s): 1f5c24c

Update appStore/tfidf_extraction.py

Browse files
Files changed (1) hide show
  1. appStore/tfidf_extraction.py +46 -17
appStore/tfidf_extraction.py CHANGED
@@ -1,30 +1,59 @@
1
  import re
 
 
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def extract_top_keywords(text, top_n=5):
5
  """
6
- Extract top_n keywords from 'text' using a simple TF-IDF approach.
7
- Returns a list of strings (keywords).
8
  """
9
- # (Optional) remove punctuation etc. so that TF-IDF doesn't see them as separate tokens
10
  cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
11
-
12
- # Initialize TF-IDF with English stop words
13
- vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
14
-
15
- # TF-IDF expects an iterable of documents, so wrap text in a list
 
 
 
 
 
 
16
  tfidf_matrix = vectorizer.fit_transform([cleaned_text])
17
-
18
- # Extract the feature names and the row (since there's only 1 doc, row=0)
19
  feature_names = vectorizer.get_feature_names_out()
20
- scores = tfidf_matrix.toarray()[0]
21
-
22
- # Pair up (feature_name, score)
23
  word_score_pairs = list(zip(feature_names, scores))
24
-
25
- # Sort by score descending
26
  word_score_pairs.sort(key=lambda x: x[1], reverse=True)
27
-
28
  # Return just the top_n words
29
- top_keywords = [w for (w, s) in word_score_pairs[:top_n]]
30
  return top_keywords
 
1
  import re
2
+ import langdetect
3
+ from stopwordsiso import stopwords
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
 
6
+
7
def detect_language(text):
    """
    Detect the language of 'text' using langdetect.

    Args:
        text: The string to analyse.

    Returns:
        The language code reported by langdetect.detect (e.g. 'en', 'de',
        'es'), or 'en' when the input is blank / not a string or when
        detection raises an error.
    """
    # Blank or non-string input cannot be classified; skip the library call
    # and go straight to the fallback.
    if not isinstance(text, str) or not text.strip():
        return 'en'

    try:
        return langdetect.detect(text)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        return 'en'  # fallback
16
+
17
+
18
def get_stopwords_for_language(lang_code):
    """
    Retrieve stop words from stopwordsiso for a given language code.

    The code is lower-cased and looked up as-is first; if that fails, the
    part before the first '-' is tried, so a regional code such as 'zh-cn'
    can still resolve to the base language 'zh'.

    Args:
        lang_code: A language code string such as 'en', 'de' or 'zh-cn'.

    Returns:
        The set of stop words for the language, or an empty set when the
        code is empty/None or the language is not available.
    """
    if not lang_code:
        return set()  # fallback to empty set

    # NOTE(review): the stopwordsiso README documents module-level
    # has_lang() and stopwords() functions; the function object imported via
    # `from stopwordsiso import stopwords` has no `langdict` or `lang`
    # attribute, so the module is imported here and its documented API used.
    import stopwordsiso

    code = lang_code.lower()
    for candidate in (code, code.split('-')[0]):
        if stopwordsiso.has_lang(candidate):
            return stopwordsiso.stopwords(candidate)

    return set()  # fallback to empty set
28
+
29
+
30
  def extract_top_keywords(text, top_n=5):
31
  """
32
+ Extract top_n keywords from 'text' using a simple TF-IDF approach with
33
+ language detection and language-specific stopwords.
34
  """
35
+ # Clean the text (remove punctuation, lower the case, etc.)
36
  cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
37
+
38
+ # Detect language
39
+ lang_code = detect_language(cleaned_text)
40
+
41
+ # Get the relevant stopwords
42
+ language_stopwords = get_stopwords_for_language(lang_code)
43
+
44
+ # Initialize TF-IDF with the custom language stop words
45
+ vectorizer = TfidfVectorizer(stop_words=language_stopwords)
46
+
47
+ # We pass in a list of one "document" to TF-IDF
48
  tfidf_matrix = vectorizer.fit_transform([cleaned_text])
49
+
 
50
  feature_names = vectorizer.get_feature_names_out()
51
+ scores = tfidf_matrix.toarray()[0] # row 0 since we only have one doc
52
+
53
+ # Pair (word, score), then sort descending by score
54
  word_score_pairs = list(zip(feature_names, scores))
 
 
55
  word_score_pairs.sort(key=lambda x: x[1], reverse=True)
56
+
57
  # Return just the top_n words
58
+ top_keywords = [word for (word, score) in word_score_pairs[:top_n]]
59
  return top_keywords