File size: 1,214 Bytes
06bd223
daaad57
0059f4a
06bd223
 
0059f4a
daaad57
 
 
3a726ab
daaad57
0059f4a
daaad57
3a726ab
 
daaad57
3a726ab
daaad57
0059f4a
06bd223
3a726ab
0059f4a
daaad57
 
3a726ab
 
daaad57
3a726ab
06bd223
3a726ab
daaad57
3a726ab
06bd223
 
daaad57
3a726ab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import re
import langdetect
from stopwordsiso import stopwords, has_lang
from sklearn.feature_extraction.text import TfidfVectorizer

def detect_language(text: str) -> str:
    """Return the language code that ``langdetect.detect`` reports for *text*.

    Detection is best-effort: if ``langdetect.detect`` raises, the function
    falls back to ``"en"`` instead of propagating the error.

    Args:
        text: The text whose language should be guessed.

    Returns:
        The detected language code, or ``"en"`` when detection fails.
    """
    try:
        return langdetect.detect(text)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        return "en"  # fallback if detection fails

def get_stopwords_for_language(lang_code: str) -> set[str]:
    """Return the stopword set for *lang_code*, or an empty set if unknown.

    The code is lower-cased before lookup. If the full code is not known to
    ``stopwordsiso`` but it carries a region/script suffix (for example
    ``"zh-cn"`` or ``"pt_BR"``), the primary subtag (``"zh"``, ``"pt"``) is
    tried as well, so region-qualified detector output still gets stopwords.

    Args:
        lang_code: Language code, e.g. ``"en"`` or ``"zh-cn"``.

    Returns:
        A set of stopwords, empty when neither the full code nor its primary
        subtag is supported.
    """
    normalized = lang_code.lower()
    if has_lang(normalized):
        return stopwords(normalized)  # returns a *set* of stopwords

    # Fall back to the part before the first "-" / "_" separator.
    primary = normalized.replace("_", "-").split("-", 1)[0]
    if primary != normalized and has_lang(primary):
        return stopwords(primary)

    return set()

def extract_top_keywords(text: str, top_n: int = 5) -> list[str]:
    """Return up to *top_n* highest-scoring TF-IDF terms found in *text*.

    The text is lower-cased and punctuation is replaced by spaces, its
    language is detected, and that language's stopwords are excluded before
    a ``TfidfVectorizer`` is fitted on the single cleaned document.

    Args:
        text: The document to extract keywords from.
        top_n: Maximum number of keywords to return. Values <= 0 yield an
            empty list.

    Returns:
        Keywords ordered by descending score. Ties keep the order of
        ``get_feature_names_out()`` because the sort is stable. An empty list
        is returned when the text contains no usable terms.
    """
    if top_n <= 0:
        # A negative slice bound would silently drop items from the end
        # instead of limiting the result, so treat it as "no keywords".
        return []

    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())
    if not cleaned_text.strip():
        # Nothing left to tokenize; the vectorizer would raise on this.
        return []

    lang_code = detect_language(cleaned_text)
    language_stopwords = get_stopwords_for_language(lang_code)

    # TfidfVectorizer expects a list (not a set) for ``stop_words``.
    vectorizer = TfidfVectorizer(stop_words=list(language_stopwords))

    try:
        tfidf_matrix = vectorizer.fit_transform([cleaned_text])
    except ValueError:
        # Raised for an empty vocabulary, e.g. when the document consists
        # only of stopwords or of tokens the default tokenizer discards.
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]

    # Pair up (word, score), then sort descending by score.
    ranked = sorted(
        zip(feature_names, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return [word for word, _ in ranked[:top_n]]