Shreyas094 committed · verified
Commit 4b05267 · 1 Parent(s): 4af5bad

Update app.py

Files changed (1)
  1. app.py +68 -46
app.py CHANGED
@@ -365,15 +365,13 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     model = get_model(temperature, top_p, repetition_penalty)
     embed = get_embeddings()

-    if news_source == "Google News RSS":
-        articles = fetch_google_news_rss(query)
-    elif news_source == "Golomt Bank":
-        articles = fetch_golomt_bank_news()
+    if news_source in website_configs:
+        articles = fetch_news_from_website(news_source)
     else:
-        return "Invalid news source selected."
+        return f"Invalid news source selected: {news_source}"

     if not articles:
-        return f"No news articles found for the given {news_source}."
+        return f"No news articles found for {news_source}."

     processed_articles = []

@@ -388,7 +386,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):

             full_summary, cleaned_summary = summarize_news_content(clean_content, model)
             relevance_score = calculate_relevance_score(cleaned_summary, model)
-            print(f"Relevance score for article '{article['title']}': {relevance_score}") # Debug print

             processed_article = {
                 "published_date": article["published_date"],
@@ -403,11 +400,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
         except Exception as e:
             print(f"Error processing article: {str(e)}")

-    # Debug print
-    print("Processed articles:")
-    for article in processed_articles:
-        print(f"Title: {article['title']}, Score: {article['relevance_score']}")
-
     if not processed_articles:
         return f"Failed to process any news articles from {news_source}. Please try again or check the summarization process."

@@ -430,46 +422,65 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):

         # Update news_database for excel export
         global news_database
-        news_database = processed_articles # Directly assign the processed articles
-
-        print("Updated news_database:")
-        for article in news_database:
-            print(f"Title: {article['title']}, Score: {article['relevance_score']}")
+        news_database = processed_articles

         return f"Processed and added {len(processed_articles)} news articles from {news_source} to the database."
     except Exception as e:
         return f"Error adding articles to the database: {str(e)}"

-def fetch_articles_from_page(url):
-    response = requests.get(url)
-    response.raise_for_status()
-    soup = BeautifulSoup(response.content, 'html.parser')
-    articles = soup.find_all('div', class_='entry-post gt-box-shadow-2')
-    return articles, soup
-
-def fetch_articles_from_page(url):
+website_configs = {
+    "Golomt Bank": {
+        "base_url": "https://golomtbank.com/en/rnews",
+        "article_selector": 'div.entry-post.gt-box-shadow-2',
+        "title_selector": 'h2.entry-title',
+        "date_selector": 'div.entry-date.gt-meta',
+        "link_selector": 'a',
+        "content_selector": 'div.entry-content',
+        "next_page_selector": 'a.next',
+        "url_prefix": "https://golomtbank.com"
+    },
+    "Bank of America": {
+        "base_url": "https://newsroom.bankofamerica.com/content/newsroom/press-releases.html",
+        "article_selector": 'div.views-row',
+        "title_selector": 'span.field-content',
+        "date_selector": 'span.date-display-single',
+        "link_selector": 'a',
+        "content_selector": 'div.field-name-body',
+        "next_page_selector": 'li.pager-next a',
+        "url_prefix": "https://newsroom.bankofamerica.com"
+    },
+    # Add more banks as needed
+}
+
+
+
+def fetch_articles_from_page(url, config):
     response = requests.get(url)
     response.raise_for_status()
     soup = BeautifulSoup(response.content, 'html.parser')
-    articles = soup.find_all('div', class_='entry-post gt-box-shadow-2')
+    articles = soup.find_all(config['article_selector'].split('.')[0], class_=config['article_selector'].split('.')[-1])
     return articles, soup

-def extract_articles(articles):
+def extract_articles(articles, config):
     article_data = []
     for article in articles:
-        title_div = article.find('h2', class_='entry-title')
+        title_div = article.find(config['title_selector'].split('.')[0], class_=config['title_selector'].split('.')[-1])
         title = title_div.get_text(strip=True) if title_div else "No Title"
-        date_div = article.find('div', class_='entry-date gt-meta')
+
+        date_div = article.find(config['date_selector'].split('.')[0], class_=config['date_selector'].split('.')[-1])
         date = date_div.get_text(strip=True) if date_div else "No Date"
-        link_tag = article.find('a')
+
+        link_tag = article.find(config['link_selector'])
         link = link_tag['href'] if link_tag else "No Link"
         if not link.startswith('http'):
-            link = "https://golomtbank.com" + link
+            link = config['url_prefix'] + link
+
         article_response = requests.get(link)
         article_response.raise_for_status()
         article_soup = BeautifulSoup(article_response.content, 'html.parser')
-        article_content_div = article_soup.find('div', class_='entry-content')
+        article_content_div = article_soup.find(config['content_selector'].split('.')[0], class_=config['content_selector'].split('.')[-1])
         article_content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
+
         article_data.append({
             'title': title,
             'date': date,
@@ -478,30 +489,34 @@ def extract_articles(articles):
         })
     return article_data

-def fetch_golomt_bank_news(num_results=20):
-    base_url = "https://golomtbank.com/en/rnews"
+def fetch_news_from_website(website_key, num_results=20):
+    config = website_configs.get(website_key)
+    if not config:
+        return f"No configuration found for website: {website_key}"
+
+    base_url = config['base_url']
     current_page_url = base_url
     all_articles = []

     try:
         while len(all_articles) < num_results:
             print(f"Fetching articles from: {current_page_url}")
-            articles, soup = fetch_articles_from_page(current_page_url)
+            articles, soup = fetch_articles_from_page(current_page_url, config)
             if not articles:
                 print("No articles found on this page.")
                 break
-            all_articles.extend(extract_articles(articles))
+            all_articles.extend(extract_articles(articles, config))
             print(f"Total articles fetched so far: {len(all_articles)}")
             if len(all_articles) >= num_results:
                 all_articles = all_articles[:num_results]
                 break
-            next_page_link = soup.find('a', class_='next')
+            next_page_link = soup.find(config['next_page_selector'])
             if not next_page_link:
                 print("No next page link found.")
                 break
             current_page_url = next_page_link['href']
             if not current_page_url.startswith('http'):
-                current_page_url = "https://golomtbank.com" + current_page_url
+                current_page_url = config['url_prefix'] + current_page_url

         return [
             {
@@ -512,7 +527,7 @@ def fetch_golomt_bank_news(num_results=20):
             } for article in all_articles
         ]
     except Exception as e:
-        print(f"Error fetching Golomt Bank news: {str(e)}")
+        print(f"Error fetching news from {website_key}: {str(e)}")
         return []

 def export_news_to_excel():
@@ -763,18 +778,25 @@ with gr.Blocks() as demo:
     repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
     web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
     google_news_rss_checkbox = gr.Checkbox(label="Google News RSS", value=False)
-
+
     with gr.Row():
-        news_source_dropdown = gr.Dropdown(
-            choices=["Google News RSS", "Golomt Bank"],
-            label="Select News Source",
-            value="Google News RSS"
+        bank_dropdown = gr.Dropdown(
+            choices=list(website_configs.keys()),
+            label="Select Bank",
+            value=list(website_configs.keys())[0]
         )
-        news_query_input = gr.Textbox(label="Enter news query (for Google News RSS)")
-        fetch_news_button = gr.Button("Fetch News")
+        fetch_news_button = gr.Button("Fetch Bank News")

     news_fetch_output = gr.Textbox(label="News Fetch Status")

+    submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox, google_news_rss_checkbox], outputs=[question_input, chatbot])
+
+    fetch_news_button.click(
+        fetch_bank_news,
+        inputs=[bank_dropdown, temperature_slider, top_p_slider, repetition_penalty_slider],
+        outputs=news_fetch_output
+    )
+
     def chat(question, history, temperature, top_p, repetition_penalty, web_search, google_news_rss):
         answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss)
         history.append((question, answer))
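
For context, a minimal standalone sketch (not part of the commit) of how the selector convention in website_configs is interpreted by fetch_articles_from_page: each selector string is split on '.', the first token is used as the tag name and the last token as the class_ filter. The sample_config entry and HTML snippet below are illustrative assumptions only.

from bs4 import BeautifulSoup

# Hypothetical config entry mirroring the shape of website_configs above.
sample_config = {"article_selector": 'div.entry-post.gt-box-shadow-2'}

# Made-up HTML standing in for a fetched page.
html = '<div class="entry-post gt-box-shadow-2"><h2 class="entry-title">Example headline</h2></div>'
soup = BeautifulSoup(html, 'html.parser')

selector = sample_config['article_selector']
# Tag name = 'div', class_ = 'gt-box-shadow-2'; BeautifulSoup's class_ filter
# matches any one of an element's classes, so the multi-class div still matches.
articles = soup.find_all(selector.split('.')[0], class_=selector.split('.')[-1])
print(len(articles))  # -> 1

# Note: find()/find_all() treat a plain string argument as a tag name, so full CSS
# selectors such as 'a.next' or 'li.pager-next a' are handled by select()/select_one()
# rather than by find() directly.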