broadfield-dev committed on
Commit
b9891ea
·
verified ·
1 Parent(s): cf1fb4e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -256
app.py CHANGED
@@ -1,155 +1,9 @@
1
  import os
2
- import feedparser
3
  from flask import Flask, render_template, request
4
- from huggingface_hub import HfApi, InferenceClient
5
- from langchain.vectorstores import Chroma
6
- from langchain.embeddings import HuggingFaceEmbeddings
7
- from langchain.docstore.document import Document
8
- import requests
9
- import shutil
10
 
11
- # Flask app setup
12
  app = Flask(__name__)
13
 
14
- # Hugging Face setup
15
- HF_API_TOKEN = os.getenv("HF_API_TOKEN", "YOUR_HF_API_TOKEN")
16
- HF_MODEL = "Qwen/Qwen-72B-Instruct"
17
- REPO_ID = "broadfield-dev/news-rag-db"
18
- LOCAL_DB_DIR = "chroma_db"
19
- client = InferenceClient(model=HF_MODEL, token=HF_API_TOKEN)
20
-
21
- # Comprehensive RSS feeds
22
- RSS_FEEDS = [
23
- "https://www.sciencedaily.com/rss/top/science.xml",
24
- "https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",
25
- "http://rss.cnn.com/rss/cnn_allpolitics.rss",
26
- "https://phys.org/rss-feed/physics-news/",
27
- "https://www.spaceweatherlive.com/en/news/rss",
28
- "https://weather.com/feeds/rss",
29
- "https://www.wired.com/feed/rss",
30
- "https://www.nasa.gov/rss/dyn/breaking_news.rss",
31
- "https://www.nationalgeographic.com/feed/",
32
- "https://www.nature.com/nature.rss",
33
- "https://www.scientificamerican.com/rss/",
34
- "https://www.newscientist.com/feed/home/",
35
- "https://www.livescience.com/feeds/all",
36
- "https://www.hindustantimes.com/feed/horoscope/rss",
37
- "https://www.washingtonpost.com/wp-srv/style/horoscopes/rss.xml",
38
- "https://astrostyle.com/feed/",
39
- "https://www.vogue.com/feed/rss",
40
- "https://feeds.bbci.co.uk/news/politics/rss.xml",
41
- "https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",
42
- "https://www.politico.com/rss/politics.xml",
43
- "https://thehill.com/feed/",
44
- "https://www.aps.org/publications/apsnews/updates/rss.cfm",
45
- "https://www.quantamagazine.org/feed/",
46
- "https://www.sciencedaily.com/rss/matter_energy/physics.xml",
47
- "https://physicsworld.com/feed/",
48
- "https://www.swpc.noaa.gov/rss.xml",
49
- "https://www.nasa.gov/rss/dyn/solar_system.rss",
50
- "https://weather.com/science/space/rss",
51
- "https://www.space.com/feeds/space-weather",
52
- "https://www.accuweather.com/en/rss",
53
- "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",
54
- "https://www.weather.gov/rss",
55
- "https://www.foxweather.com/rss",
56
- "https://techcrunch.com/feed/",
57
- "https://arstechnica.com/feed/",
58
- "https://gizmodo.com/rss",
59
- "https://www.theverge.com/rss/index.xml",
60
- "https://www.space.com/feeds/all",
61
- "https://www.universetoday.com/feed/",
62
- "https://skyandtelescope.org/feed/",
63
- "https://www.esa.int/rss",
64
- "https://www.smithsonianmag.com/rss/",
65
- "https://www.popsci.com/rss.xml",
66
- "https://www.discovermagazine.com/rss",
67
- "https://www.atlasobscura.com/feeds/latest"
68
- ]
69
-
70
- # Embedding model
71
- embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
72
- vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
73
- hf_api = HfApi()
74
-
75
- def fetch_rss_feeds():
76
- articles = []
77
- for feed_url in RSS_FEEDS:
78
- feed = feedparser.parse(feed_url)
79
- for entry in feed.entries[:5]: # Limit to 5 per feed
80
- articles.append({
81
- "title": entry.get("title", "No Title"),
82
- "link": entry.get("link", ""),
83
- "description": entry.get("summary", entry.get("description", "No Description")),
84
- "published": entry.get("published", "Unknown Date"),
85
- "category": categorize_feed(feed_url),
86
- })
87
- return articles
88
-
89
- def categorize_feed(url):
90
- if "sciencedaily" in url or "phys.org" in url:
91
- return "Science & Physics"
92
- elif "horoscope" in url:
93
- return "Astrology"
94
- elif "politics" in url:
95
- return "Politics"
96
- elif "spaceweather" in url or "nasa" in url:
97
- return "Solar & Space"
98
- elif "weather" in url:
99
- return "Earth Weather"
100
- else:
101
- return "Cool Stuff"
102
-
103
- def summarize_article(text):
104
- prompt = f"Summarize the following text concisely:\n\n{text}"
105
- response = client.text_generation(prompt, max_new_tokens=100, temperature=0.7)
106
- return response.strip()
107
-
108
- def categorize_article(text):
109
- prompt = f"Classify the sentiment as positive, negative, or neutral:\n\n{text}"
110
- response = client.text_generation(prompt, max_new_tokens=10, temperature=0.7)
111
- return response.strip()
112
-
113
- def process_and_store_articles(articles):
114
- documents = []
115
- for article in articles:
116
- summary = summarize_article(article["description"])
117
- sentiment = categorize_article(article["description"])
118
- doc = Document(
119
- page_content=summary,
120
- metadata={
121
- "title": article["title"],
122
- "link": article["link"],
123
- "original_description": article["description"],
124
- "published": article["published"],
125
- "category": article["category"],
126
- "sentiment": sentiment,
127
- }
128
- )
129
- documents.append(doc)
130
- vector_db.add_documents(documents)
131
- vector_db.persist()
132
- upload_to_hf_hub()
133
-
134
- def upload_to_hf_hub():
135
- if os.path.exists(LOCAL_DB_DIR):
136
- try:
137
- hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True)
138
- except Exception as e:
139
- print(f"Error creating repo: {e}")
140
- for root, _, files in os.walk(LOCAL_DB_DIR):
141
- for file in files:
142
- local_path = os.path.join(root, file)
143
- remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
144
- hf_api.upload_file(
145
- path_or_fileobj=local_path,
146
- path_in_repo=remote_path,
147
- repo_id=REPO_ID,
148
- repo_type="dataset",
149
- token=HF_API_TOKEN
150
- )
151
- print(f"Database uploaded to: {REPO_ID}")
152
-
153
  @app.route('/', methods=['GET', 'POST'])
154
  def index():
155
  articles = fetch_rss_feeds()
@@ -192,113 +46,5 @@ def index():
192
 
193
  return render_template("index.html", categorized_articles=categorized_articles)
194
 
195
- HTML_TEMPLATE = """
196
- <!DOCTYPE html>
197
- <html lang="en">
198
- <head>
199
- <meta charset="UTF-8">
200
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
201
- <title>News Feed Hub</title>
202
- <style>
203
- body {
204
- font-family: 'Arial', sans-serif;
205
- margin: 0;
206
- padding: 20px;
207
- background-color: #f4f4f9;
208
- color: #333;
209
- }
210
- h1 {
211
- text-align: center;
212
- color: #2c3e50;
213
- }
214
- .search-container {
215
- text-align: center;
216
- margin: 20px 0;
217
- }
218
- .search-bar {
219
- width: 50%;
220
- padding: 12px;
221
- font-size: 16px;
222
- border: 2px solid #3498db;
223
- border-radius: 25px;
224
- box-shadow: 0 2px 5px rgba(0,0,0,0.1);
225
- outline: none;
226
- transition: border-color 0.3s;
227
- }
228
- .search-bar:focus {
229
- border-color: #2980b9;
230
- }
231
- .category-section {
232
- margin: 30px 0;
233
- }
234
- .category-title {
235
- background-color: #3498db;
236
- color: white;
237
- padding: 10px;
238
- border-radius: 5px;
239
- font-size: 1.4em;
240
- }
241
- .article {
242
- background-color: white;
243
- padding: 15px;
244
- margin: 10px 0;
245
- border-radius: 8px;
246
- box-shadow: 0 2px 5px rgba(0,0,0,0.1);
247
- transition: transform 0.2s;
248
- }
249
- .article:hover {
250
- transform: translateY(-3px);
251
- }
252
- .title a {
253
- font-size: 1.2em;
254
- color: #2c3e50;
255
- text-decoration: none;
256
- }
257
- .title a:hover {
258
- color: #3498db;
259
- }
260
- .summary {
261
- color: #555;
262
- margin: 5px 0;
263
- }
264
- .sentiment {
265
- font-style: italic;
266
- color: #7f8c8d;
267
- }
268
- .published {
269
- font-size: 0.9em;
270
- color: #95a5a6;
271
- }
272
- </style>
273
- </head>
274
- <body>
275
- <h1>News Feed Hub</h1>
276
- <div class="search-container">
277
- <form method="POST">
278
- <input type="text" name="search" class="search-bar" placeholder="Search news semantically...">
279
- </form>
280
- </div>
281
- {% for category, articles in categorized_articles.items() %}
282
- <div class="category-section">
283
- <div class="category-title">{{ category }}</div>
284
- {% for article in articles %}
285
- <div class="article">
286
- <div class="title"><a href="{{ article.link }}" target="_blank">{{ article.title }}</a></div>
287
- <div class="summary">{{ article.summary }}</div>
288
- <div class="sentiment">Sentiment: {{ article.sentiment }}</div>
289
- <div class="published">Published: {{ article.published }}</div>
290
- </div>
291
- {% endfor %}
292
- </div>
293
- {% endfor %}
294
- </body>
295
- </html>
296
- """
297
-
298
  if __name__ == "__main__":
299
- os.makedirs("templates", exist_ok=True)
300
- with open("templates/index.html", "w") as f:
301
- f.write(HTML_TEMPLATE)
302
- if os.path.exists(LOCAL_DB_DIR):
303
- shutil.rmtree(LOCAL_DB_DIR)
304
- app.run(debug=True, host="0.0.0.0", port=7860)
 
1
  import os
 
2
  from flask import Flask, render_template, request
3
+ from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
 
 
 
 
 
4
 
 
5
  app = Flask(__name__)
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  @app.route('/', methods=['GET', 'POST'])
8
  def index():
9
  articles = fetch_rss_feeds()
 
46
 
47
  return render_template("index.html", categorized_articles=categorized_articles)
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  if __name__ == "__main__":
50
+ app.run(host="0.0.0.0", port=7860)