awacke1 committed on
Commit
b997fd9
·
verified ·
1 Parent(s): 71c8ee5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -0
app.py CHANGED
@@ -59,7 +59,62 @@ FILE_EMOJIS = {
59
  "mp3": "🎵",
60
  }
61
 
 
 
62
  def get_high_info_terms(text: str) -> list:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  stop_words = set([
64
  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
65
  'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
 
59
  "mp3": "🎵",
60
  }
61
 
62
+
63
+
64
  def get_high_info_terms(text: str) -> list:
65
+ # Expanded stop words
66
+ stop_words = set([
67
+ 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
68
+ 'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
69
+ 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
70
+ 'should', 'could', 'might', 'must', 'shall', 'can', 'may', 'this', 'that', 'these',
71
+ 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
72
+ 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
73
+ 'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there', 'as', 'if', 'while'
74
+ ])
75
+
76
+ # Key phrases tailored to your interests
77
+ key_phrases = [
78
+ 'artificial intelligence', 'machine learning', 'deep learning', 'neural networks',
79
+ 'natural language processing', 'healthcare systems', 'clinical medicine',
80
+ 'genomics', 'biological systems', 'cognitive science', 'data visualization',
81
+ 'wellness technology', 'robotics', 'medical imaging', 'semantic understanding',
82
+ 'transformers', 'large language models', 'empirical studies', 'scientific research',
83
+ 'quantum mechanics', 'biomedical engineering', 'computational biology'
84
+ ]
85
+
86
+ # Preserve key phrases and remove them from the text
87
+ preserved_phrases = []
88
+ lower_text = text.lower()
89
+ for phrase in key_phrases:
90
+ if phrase in lower_text:
91
+ preserved_phrases.append(phrase)
92
+ text = text.replace(phrase, '')
93
+ break # Stop after the first matching key phrase
94
+
95
+ # Extract words and filter high-info terms
96
+ words = re.findall(r'\b\w+(?:-\w+)*\b', text)
97
+ high_info_words = [
98
+ word.lower() for word in words
99
+ if len(word) > 3
100
+ and word.lower() not in stop_words
101
+ and not word.isdigit()
102
+ and any(c.isalpha() for c in word)
103
+ ]
104
+
105
+ # Combine preserved phrases and filtered words, ensuring uniqueness
106
+ unique_terms = []
107
+ seen = set()
108
+ for term in preserved_phrases + high_info_words:
109
+ if term not in seen:
110
+ seen.add(term)
111
+ unique_terms.append(term)
112
+
113
+ # Return only the top 5 terms
114
+ return unique_terms[:5]
115
+
116
+
117
+ def get_high_info_terms_old(text: str) -> list:
118
  stop_words = set([
119
  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
120
  'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',