File size: 1,915 Bytes
06bd223
daaad57
0059f4a
06bd223
 
0059f4a
139362e
daaad57
 
 
0059f4a
 
daaad57
0059f4a
daaad57
 
0059f4a
daaad57
 
0059f4a
 
 
daaad57
0059f4a
daaad57
0059f4a
06bd223
0059f4a
 
06bd223
0059f4a
06bd223
139362e
0059f4a
 
daaad57
 
0059f4a
daaad57
06bd223
daaad57
06bd223
0059f4a
daaad57
0059f4a
06bd223
 
daaad57
0059f4a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
import langdetect
from stopwordsiso import stopwords, has_lang
from sklearn.feature_extraction.text import TfidfVectorizer

def detect_language(text: str) -> str:
    """Detect the language of *text* with langdetect.

    Args:
        text: Input text to classify.

    Returns:
        A language code such as 'en', 'de', or 'es'; 'en' as a fallback
        when detection fails (e.g. empty or featureless input).
    """
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        # Narrow catch: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit. langdetect raises
        # LangDetectException when no features can be extracted.
        return 'en'

def get_stopwords_for_language(lang_code: str):
    """Return the stopwordsiso stopword set for *lang_code*.

    The code is normalized to lowercase first. Languages not covered by
    stopwordsiso yield an empty set rather than raising.
    """
    normalized = lang_code.lower()
    # Guard clause: unsupported language -> empty set fallback.
    if not has_lang(normalized):
        return set()
    # stopwords() returns the set of stopwords for a supported code.
    return stopwords(normalized)

def extract_top_keywords(text: str, top_n: int = 5) -> list[str]:
    """Extract up to *top_n* keywords from *text* using TF-IDF.

    The text is lower-cased and stripped of punctuation, its language is
    detected, and language-specific stopwords are excluded before scoring.

    Args:
        text: Input document.
        top_n: Maximum number of keywords to return.

    Returns:
        Up to top_n words sorted by descending TF-IDF score. Empty list
        when no scorable terms remain (e.g. empty or stopwords-only input).
    """
    # Basic cleanup: replace punctuation with spaces, lower-case the text.
    cleaned_text = re.sub(r"[^\w\s]", " ", text.lower())

    # Detect language and fetch its stopword set.
    lang_code = detect_language(cleaned_text)
    language_stopwords = get_stopwords_for_language(lang_code)

    # scikit-learn's parameter validation requires stop_words to be a
    # list, 'english', or None — passing a set raises. Also map an empty
    # set to None (no stopword filtering).
    stop_words = list(language_stopwords) if language_stopwords else None

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    try:
        tfidf_matrix = vectorizer.fit_transform([cleaned_text])
    except ValueError:
        # Raised as "empty vocabulary" when the text contains no word
        # tokens or only stopwords; return no keywords instead of crashing.
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]  # single document -> single row

    # Rank (word, score) pairs by descending score and keep the top N.
    ranked = sorted(zip(feature_names, scores), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:top_n]]