rajat5ranjan committed on
Commit
1dea348
·
verified ·
1 Parent(s): e44c761

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -2
app.py CHANGED
@@ -26,7 +26,7 @@ import altair as alt
26
  from GoogleNews import GoogleNews
27
  from bs4 import BeautifulSoup
28
  import requests
29
-
30
  st.set_page_config(layout="wide")
31
 
32
  GOOGLE_API_KEY=os.environ['GOOGLE_API_KEY']
@@ -424,6 +424,15 @@ elif activities=="News Sentiment":
424
 
425
  """
426
 
 
 
 
 
 
 
 
 
 
427
  def get_google_news_documents(query: str, max_articles: int = 10, timeout: int = 10):
428
  """
429
  Fetches news articles from Google News and returns a list of LangChain Document objects,
@@ -444,8 +453,11 @@ elif activities=="News Sentiment":
444
 
445
  documents = []
446
  for article in articles[:max_articles]:
447
- url = article.get("link")
 
 
448
  try:
 
449
  response = requests.get(url, timeout=timeout, headers={
450
  "User-Agent": "Mozilla/5.0"
451
  })
 
26
  from GoogleNews import GoogleNews
27
  from bs4 import BeautifulSoup
28
  import requests
29
+ from urllib.parse import urlparse, urlunparse
30
  st.set_page_config(layout="wide")
31
 
32
  GOOGLE_API_KEY=os.environ['GOOGLE_API_KEY']
 
424
 
425
  """
426
 
427
+
428
+ def clean_google_news_url(url: str) -> str:
429
+ """
430
+ Remove Google tracking parameters (&ved=..., &usg=..., etc.) from article URLs.
431
+ Keeps only the main part up to .html
432
+ """
433
+ if ".html" in url:
434
+ return url.split(".html")[0] + ".html"
435
+ return url
436
  def get_google_news_documents(query: str, max_articles: int = 10, timeout: int = 10):
437
  """
438
  Fetches news articles from Google News and returns a list of LangChain Document objects,
 
453
 
454
  documents = []
455
  for article in articles[:max_articles]:
456
+
457
+
458
+ url = clean_google_news_url(article.get("link"))
459
  try:
460
+ st.caption(f" Trying URL... {url}")
461
  response = requests.get(url, timeout=timeout, headers={
462
  "User-Agent": "Mozilla/5.0"
463
  })