Spaces:

GIZ
/

GIZ-Project-Search

Running on CPU Upgrade

App Files Files Community

annikwag committed on Jan 7

Commit

daaad57

verified ·

1 Parent(s): 1f5c24c

Update appStore/tfidf_extraction.py

Browse files

Files changed (1) hide show

appStore/tfidf_extraction.py +46 -17

appStore/tfidf_extraction.py CHANGED Viewed

@@ -1,30 +1,59 @@
 import re
 from sklearn.feature_extraction.text import TfidfVectorizer
 def extract_top_keywords(text, top_n=5):
     """
-    Extract top_n keywords from 'text' using a simple TF-IDF approach.
-    Returns a list of strings (keywords).
     """
-    # (Optional) remove punctuation etc. so that TF-IDF doesn't see them as separate tokens
     cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
-    # Initialize TF-IDF with English stop words
-    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
-    # TF-IDF expects an iterable of documents, so wrap text in a list
     tfidf_matrix = vectorizer.fit_transform([cleaned_text])
-    # Extract the feature names and the row (since there's only 1 doc, row=0)
     feature_names = vectorizer.get_feature_names_out()
-    scores = tfidf_matrix.toarray()[0]
-    # Pair up (feature_name, score)
     word_score_pairs = list(zip(feature_names, scores))
-    # Sort by score descending
     word_score_pairs.sort(key=lambda x: x[1], reverse=True)
     # Return just the top_n words
-    top_keywords = [w for (w, s) in word_score_pairs[:top_n]]
     return top_keywords

 import re
+import langdetect
+from stopwordsiso import stopwords
 from sklearn.feature_extraction.text import TfidfVectorizer
def detect_language(text):
    """
    Detect the language of 'text' using langdetect.

    Returns the language code reported by langdetect (e.g. 'en', 'de', 'es').
    Falls back to 'en' when 'text' is not a string, is empty or
    whitespace-only, or when detection raises an error.
    """
    # There is nothing to detect in blank or non-string input, so skip the
    # detector and go straight to the fallback.
    if not isinstance(text, str) or not text.strip():
        return 'en'
    try:
        return langdetect.detect(text)
    except Exception:
        # Best-effort fallback. Unlike a bare `except:`, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return 'en'
def get_stopwords_for_language(lang_code):
    """
    Retrieve the stopwordsiso stop words for a given language code.

    'lang_code' is matched case-insensitively. If the full code is unknown,
    its primary subtag is tried as well (e.g. 'zh-cn' -> 'zh').
    Returns a set of stop words, or an empty set when the code is empty,
    None, or not covered by stopwordsiso.
    """
    if not lang_code:
        return set()

    # Imported locally because the package-level helpers are needed here:
    # `from stopwordsiso import stopwords` only provides the loader function,
    # which has no `langdict` / `lang` attributes.
    import stopwordsiso

    normalized = lang_code.lower()
    # Try the exact code first, then the part before any region suffix.
    for candidate in (normalized, normalized.split("-")[0]):
        if stopwordsiso.has_lang(candidate):
            return set(stopwordsiso.stopwords(candidate))
    return set()  # fallback to empty set
 def extract_top_keywords(text, top_n=5):
     """
+    Extract top_n keywords from 'text' using a simple TF-IDF approach with
+    language detection and language-specific stopwords.
     """
+    # Clean the text (remove punctuation, lower the case, etc.)
     cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
+    # Detect language
+    lang_code = detect_language(cleaned_text)
+    # Get the relevant stopwords
+    language_stopwords = get_stopwords_for_language(lang_code)
+    # Initialize TF-IDF with the custom language stop words
+    vectorizer = TfidfVectorizer(stop_words=language_stopwords)
+    # We pass in a list of one "document" to TF-IDF
     tfidf_matrix = vectorizer.fit_transform([cleaned_text])
     feature_names = vectorizer.get_feature_names_out()
+    scores = tfidf_matrix.toarray()[0]  # row 0 since we only have one doc
+    # Pair (word, score), then sort descending by score
     word_score_pairs = list(zip(feature_names, scores))
     word_score_pairs.sort(key=lambda x: x[1], reverse=True)
     # Return just the top_n words
+    top_keywords = [word for (word, score) in word_score_pairs[:top_n]]
     return top_keywords