Update app.py
app.py
CHANGED
@@ -59,7 +59,62 @@ FILE_EMOJIS = {
     "mp3": "🎵",
 }
 
+
+
 def get_high_info_terms(text: str) -> list:
+    # Expanded stop words
+    stop_words = set([
+        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
+        'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
+        'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
+        'should', 'could', 'might', 'must', 'shall', 'can', 'may', 'this', 'that', 'these',
+        'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
+        'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
+        'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there', 'as', 'if', 'while'
+    ])
+
+    # Key phrases tailored to your interests
+    key_phrases = [
+        'artificial intelligence', 'machine learning', 'deep learning', 'neural networks',
+        'natural language processing', 'healthcare systems', 'clinical medicine',
+        'genomics', 'biological systems', 'cognitive science', 'data visualization',
+        'wellness technology', 'robotics', 'medical imaging', 'semantic understanding',
+        'transformers', 'large language models', 'empirical studies', 'scientific research',
+        'quantum mechanics', 'biomedical engineering', 'computational biology'
+    ]
+
+    # Preserve key phrases and remove them from the text
+    preserved_phrases = []
+    lower_text = text.lower()
+    for phrase in key_phrases:
+        if phrase in lower_text:
+            preserved_phrases.append(phrase)
+            text = text.replace(phrase, '')
+            break  # Stop after the first matching key phrase
+
+    # Extract words and filter high-info terms
+    words = re.findall(r'\b\w+(?:-\w+)*\b', text)
+    high_info_words = [
+        word.lower() for word in words
+        if len(word) > 3
+        and word.lower() not in stop_words
+        and not word.isdigit()
+        and any(c.isalpha() for c in word)
+    ]
+
+    # Combine preserved phrases and filtered words, ensuring uniqueness
+    unique_terms = []
+    seen = set()
+    for term in preserved_phrases + high_info_words:
+        if term not in seen:
+            seen.add(term)
+            unique_terms.append(term)
+
+    # Return only the top 5 terms
+    return unique_terms[:5]
+
+
+def get_high_info_terms_old(text: str) -> list:
     stop_words = set([
         'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
         'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
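
For reference, a minimal usage sketch of the new get_high_info_terms follows. It is not part of the commit, and it assumes app.py already imports re at module level, since the added code calls re.findall. The sample input and output are illustrative only:

# Illustrative call (hypothetical sample text, not from the repository)
sample = ("New results in machine learning and natural language processing "
          "suggest transformers improve clinical documentation workflows.")
terms = get_high_info_terms(sample)
# Only the first matching key phrase is preserved (the loop breaks after one
# match), so 'natural language processing' is split into individual words:
# ['machine learning', 'results', 'natural', 'language', 'processing']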