Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 1,915 Bytes
06bd223 daaad57 0059f4a 06bd223 0059f4a 139362e daaad57 0059f4a daaad57 0059f4a daaad57 0059f4a daaad57 0059f4a daaad57 0059f4a daaad57 0059f4a 06bd223 0059f4a 06bd223 0059f4a 06bd223 139362e 0059f4a daaad57 0059f4a daaad57 06bd223 daaad57 06bd223 0059f4a daaad57 0059f4a 06bd223 daaad57 0059f4a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import re
import langdetect
from stopwordsiso import stopwords, has_lang
from sklearn.feature_extraction.text import TfidfVectorizer
def detect_language(text: str) -> str:
    """Detect the language of *text* using langdetect.

    Returns an ISO 639-1 language code (e.g. 'en', 'de', 'es').
    Falls back to 'en' when detection fails, e.g. for empty or
    otherwise undetectable input.
    """
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        # langdetect raises LangDetectException for empty/ambiguous text.
        # A bare `except:` here would also swallow KeyboardInterrupt and
        # SystemExit, so we catch only the library's own error.
        return 'en'
def get_stopwords_for_language(lang_code: str):
    """
    Look up the stopword set stopwordsiso provides for `lang_code`.

    Unsupported language codes yield an empty set rather than raising,
    so callers can use the result unconditionally.
    """
    code = lang_code.lower()
    # has_lang() reports whether stopwordsiso ships data for this code;
    # stopwords() then returns that language's stopword set.
    return stopwords(code) if has_lang(code) else set()
def extract_top_keywords(text: str, top_n: int = 5) -> list[str]:
    """
    Extract up to `top_n` keywords from `text` using TF-IDF scores,
    with automatic language detection and language-specific stopwords.

    Returns a list of words ordered by descending TF-IDF score. The list
    is empty when no vocabulary remains (e.g. `text` is blank or consists
    entirely of stopwords).
    """
    # Basic cleanup: replace punctuation with spaces and lowercase.
    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())

    # Detect language and fetch the matching stopword list.
    lang_code = detect_language(cleaned_text)
    language_stopwords = get_stopwords_for_language(lang_code)

    # scikit-learn documents `stop_words` as a *list*; passing a set is
    # rejected by the parameter validation added in sklearn 1.2
    # (InvalidParameterError). Sort for a deterministic vectorizer config.
    vectorizer = TfidfVectorizer(stop_words=sorted(language_stopwords))
    try:
        tfidf_matrix = vectorizer.fit_transform([cleaned_text])
    except ValueError:
        # Raised as "empty vocabulary" when the cleaned text has no
        # tokens left after stopword removal — return no keywords.
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]  # single document -> single row

    # Pair (word, score) and sort descending by score.
    word_score_pairs = sorted(
        zip(feature_names, scores), key=lambda pair: pair[1], reverse=True
    )
    return [word for word, _ in word_score_pairs[:top_n]]
|