Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 1,214 Bytes
06bd223 daaad57 0059f4a 06bd223 0059f4a daaad57 3a726ab daaad57 0059f4a daaad57 3a726ab daaad57 3a726ab daaad57 0059f4a 06bd223 3a726ab 0059f4a daaad57 3a726ab daaad57 3a726ab 06bd223 3a726ab daaad57 3a726ab 06bd223 daaad57 3a726ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import re
import langdetect
from stopwordsiso import stopwords, has_lang
from sklearn.feature_extraction.text import TfidfVectorizer
def detect_language(text: str) -> str:
    """Detect the ISO 639-1 language code of *text* via langdetect.

    Returns "en" as a fallback when detection fails (e.g. empty or
    featureless input).
    """
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        # Narrow catch: only genuine detection failures fall back to
        # English; programming errors and interrupts still propagate.
        # (The bare `except:` it replaces also swallowed SystemExit.)
        return "en"
def get_stopwords_for_language(lang_code: str):
    """Return the stopword set for *lang_code* (case-insensitive).

    Yields an empty set when stopwords-iso has no list for the language.
    """
    code = lang_code.lower()
    if not has_lang(code):
        return set()
    # stopwords() returns a *set* of stopwords for the given language.
    return stopwords(code)
def extract_top_keywords(text: str, top_n: int = 5) -> list[str]:
    """Return up to *top_n* keywords from *text*, ranked by TF-IDF score.

    The text is lowercased, punctuation is replaced with spaces, its
    language is auto-detected, and language-specific stopwords are
    excluded from the vocabulary. Returns an empty list when the text
    yields no usable terms (empty input, or every token is a stopword).
    """
    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
    lang_code = detect_language(cleaned_text)
    language_stopwords = get_stopwords_for_language(lang_code)
    # TfidfVectorizer expects a list (not a set) for stop_words.
    vectorizer = TfidfVectorizer(stop_words=list(language_stopwords))
    try:
        tfidf_matrix = vectorizer.fit_transform([cleaned_text])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when no
        # term survives tokenization/stopword removal — nothing to rank.
        return []
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]
    # Rank terms by descending TF-IDF score and keep the top_n words.
    ranked = sorted(zip(feature_names, scores), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:top_n]]
|