Update app.py
app.py
CHANGED
@@ -365,15 +365,13 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     model = get_model(temperature, top_p, repetition_penalty)
     embed = get_embeddings()
 
-    if news_source
-        articles =
-    elif news_source == "Golomt Bank":
-        articles = fetch_golomt_bank_news()
+    if news_source in website_configs:
+        articles = fetch_news_from_website(news_source)
     else:
-        return "Invalid news source selected
+        return f"Invalid news source selected: {news_source}"
 
     if not articles:
-        return f"No news articles found for
+        return f"No news articles found for {news_source}."
 
     processed_articles = []
 
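A side note on ordering: `process_news` now refers to `website_configs`, which this commit defines further down the file (after the function body). That is fine in Python, because a module-level name used inside a function is resolved when the function is called, not when it is defined. A minimal sketch, not from the commit, just to illustrate the point:

```python
# Minimal sketch (illustration only): the lookup happens at call time,
# so website_configs may be defined after the function that uses it.
def is_known_source(source):
    return source in website_configs  # resolved when is_known_source() runs

website_configs = {"Golomt Bank": {}, "Bank of America": {}}

print(is_known_source("Golomt Bank"))  # True
print(is_known_source("Khan Bank"))    # False (arbitrary unknown key)
```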
@@ -388,7 +386,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
 
             full_summary, cleaned_summary = summarize_news_content(clean_content, model)
             relevance_score = calculate_relevance_score(cleaned_summary, model)
-            print(f"Relevance score for article '{article['title']}': {relevance_score}")  # Debug print
 
             processed_article = {
                 "published_date": article["published_date"],
@@ -403,11 +400,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
         except Exception as e:
             print(f"Error processing article: {str(e)}")
 
-    # Debug print
-    print("Processed articles:")
-    for article in processed_articles:
-        print(f"Title: {article['title']}, Score: {article['relevance_score']}")
-
     if not processed_articles:
         return f"Failed to process any news articles from {news_source}. Please try again or check the summarization process."
 
@@ -430,46 +422,65 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
 
         # Update news_database for excel export
         global news_database
-        news_database = processed_articles
-
-        print("Updated news_database:")
-        for article in news_database:
-            print(f"Title: {article['title']}, Score: {article['relevance_score']}")
+        news_database = processed_articles
 
         return f"Processed and added {len(processed_articles)} news articles from {news_source} to the database."
     except Exception as e:
         return f"Error adding articles to the database: {str(e)}"
 
-
-
-
-
-
-
-
-
+website_configs = {
+    "Golomt Bank": {
+        "base_url": "https://golomtbank.com/en/rnews",
+        "article_selector": 'div.entry-post.gt-box-shadow-2',
+        "title_selector": 'h2.entry-title',
+        "date_selector": 'div.entry-date.gt-meta',
+        "link_selector": 'a',
+        "content_selector": 'div.entry-content',
+        "next_page_selector": 'a.next',
+        "url_prefix": "https://golomtbank.com"
+    },
+    "Bank of America": {
+        "base_url": "https://newsroom.bankofamerica.com/content/newsroom/press-releases.html",
+        "article_selector": 'div.views-row',
+        "title_selector": 'span.field-content',
+        "date_selector": 'span.date-display-single',
+        "link_selector": 'a',
+        "content_selector": 'div.field-name-body',
+        "next_page_selector": 'li.pager-next a',
+        "url_prefix": "https://newsroom.bankofamerica.com"
+    },
+    # Add more banks as needed
+}
+
+
+
+def fetch_articles_from_page(url, config):
     response = requests.get(url)
     response.raise_for_status()
     soup = BeautifulSoup(response.content, 'html.parser')
-    articles = soup.find_all('
+    articles = soup.find_all(config['article_selector'].split('.')[0], class_=config['article_selector'].split('.')[-1])
     return articles, soup
 
-def extract_articles(articles):
+def extract_articles(articles, config):
     article_data = []
     for article in articles:
-        title_div = article.find('
+        title_div = article.find(config['title_selector'].split('.')[0], class_=config['title_selector'].split('.')[-1])
         title = title_div.get_text(strip=True) if title_div else "No Title"
-
+
+        date_div = article.find(config['date_selector'].split('.')[0], class_=config['date_selector'].split('.')[-1])
         date = date_div.get_text(strip=True) if date_div else "No Date"
-
+
+        link_tag = article.find(config['link_selector'])
         link = link_tag['href'] if link_tag else "No Link"
         if not link.startswith('http'):
-            link =
+            link = config['url_prefix'] + link
+
         article_response = requests.get(link)
        article_response.raise_for_status()
         article_soup = BeautifulSoup(article_response.content, 'html.parser')
-        article_content_div = article_soup.find('
+        article_content_div = article_soup.find(config['content_selector'].split('.')[0], class_=config['content_selector'].split('.')[-1])
         article_content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
+
         article_data.append({
             'title': title,
             'date': date,
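The new helpers derive BeautifulSoup arguments from the selector strings in `website_configs` by splitting on `.`: the first segment is treated as the tag name and the last segment as the class. A minimal sketch of that mapping, using the Golomt Bank `article_selector` added above (the sample HTML is invented for illustration); note that a multi-class selector such as `div.entry-post.gt-box-shadow-2` is matched only on its last class this way, whereas `soup.select()` would honour every part of the selector.

```python
# Sketch of how the selector strings are interpreted; the HTML snippet is invented.
from bs4 import BeautifulSoup

config = {"article_selector": 'div.entry-post.gt-box-shadow-2'}

html = '<div class="entry-post gt-box-shadow-2"><h2 class="entry-title">Hello</h2></div>'
soup = BeautifulSoup(html, 'html.parser')

tag = config['article_selector'].split('.')[0]    # 'div'
cls = config['article_selector'].split('.')[-1]   # 'gt-box-shadow-2' (only the last class is used)
articles = soup.find_all(tag, class_=cls)

selected = soup.select(config['article_selector'])  # CSS-aware alternative, honours all classes
print(len(articles), len(selected))  # 1 1
```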
@@ -478,30 +489,34 @@ def extract_articles(articles):
         })
     return article_data
 
-def fetch_golomt_bank_news(num_results=20):
-
+def fetch_news_from_website(website_key, num_results=20):
+    config = website_configs.get(website_key)
+    if not config:
+        return f"No configuration found for website: {website_key}"
+
+    base_url = config['base_url']
     current_page_url = base_url
     all_articles = []
 
     try:
         while len(all_articles) < num_results:
             print(f"Fetching articles from: {current_page_url}")
-            articles, soup = fetch_articles_from_page(current_page_url)
+            articles, soup = fetch_articles_from_page(current_page_url, config)
             if not articles:
                 print("No articles found on this page.")
                 break
-            all_articles.extend(extract_articles(articles))
+            all_articles.extend(extract_articles(articles, config))
             print(f"Total articles fetched so far: {len(all_articles)}")
             if len(all_articles) >= num_results:
                 all_articles = all_articles[:num_results]
                 break
-            next_page_link = soup.find('
+            next_page_link = soup.find(config['next_page_selector'])
             if not next_page_link:
                 print("No next page link found.")
                 break
             current_page_url = next_page_link['href']
             if not current_page_url.startswith('http'):
-                current_page_url =
+                current_page_url = config['url_prefix'] + current_page_url
 
         return [
             {
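Assuming the configuration and helpers above are loaded, fetching from one configured site might look like the sketch below (network access and the sites' current markup are assumptions, and the keys of the returned dicts come from the return block that is not shown in this diff). One caveat worth flagging: `soup.find(config['next_page_selector'])` passes strings such as `'a.next'` or `'li.pager-next a'` as a literal tag name, so BeautifulSoup searches for a tag called `a.next`; if pagination is meant to follow those selectors, `soup.select_one(...)` is the CSS-aware lookup.

```python
# Usage sketch only; assumes app.py's website_configs and helpers are defined.
result = fetch_news_from_website("Golomt Bank", num_results=5)

if isinstance(result, str):
    # Unknown keys return an error string instead of a list of articles.
    print(result)
else:
    for item in result:
        print(item)  # field names depend on the return block not shown in this diff

# CSS-aware alternative for the pagination lookup (hypothetical change, not in the commit):
# next_page_link = soup.select_one(config['next_page_selector'])
```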
@@ -512,7 +527,7 @@ def fetch_golomt_bank_news(num_results=20):
             } for article in all_articles
         ]
     except Exception as e:
-        print(f"Error fetching
+        print(f"Error fetching news from {website_key}: {str(e)}")
         return []
 
 def export_news_to_excel():
@@ -763,18 +778,25 @@ with gr.Blocks() as demo:
         repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
         web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
         google_news_rss_checkbox = gr.Checkbox(label="Google News RSS", value=False)
-
+
         with gr.Row():
-
-            choices=
-            label="Select
-            value=
+            bank_dropdown = gr.Dropdown(
+                choices=list(website_configs.keys()),
+                label="Select Bank",
+                value=list(website_configs.keys())[0]
            )
-
-        fetch_news_button = gr.Button("Fetch News")
+            fetch_news_button = gr.Button("Fetch Bank News")
 
         news_fetch_output = gr.Textbox(label="News Fetch Status")
 
+        submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox, google_news_rss_checkbox], outputs=[question_input, chatbot])
+
+        fetch_news_button.click(
+            fetch_bank_news,
+            inputs=[bank_dropdown, temperature_slider, top_p_slider, repetition_penalty_slider],
+            outputs=news_fetch_output
+        )
+
     def chat(question, history, temperature, top_p, repetition_penalty, web_search, google_news_rss):
         answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss)
         history.append((question, answer))
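`fetch_news_button.click(...)` above targets `fetch_bank_news`, which is not defined in any hunk shown here; if it does not exist elsewhere in app.py, a thin wrapper over `process_news` along these lines would match the declared inputs and the single status-textbox output (the empty query argument is an assumption). Note also that `submit_button.click(chat, ...)` is registered a few lines before `def chat(...)`; unless `chat` is already bound earlier in the file, that call will raise a NameError while the Blocks context is being built.

```python
# Hypothetical sketch only: fetch_bank_news is not shown in this diff.
def fetch_bank_news(bank_name, temperature, top_p, repetition_penalty):
    # Delegate to process_news, which returns the status string shown in news_fetch_output.
    return process_news("", temperature, top_p, repetition_penalty, bank_name)
```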