Update app.py
Browse files
app.py
CHANGED
|
@@ -26,7 +26,7 @@ import altair as alt
|
|
| 26 |
from GoogleNews import GoogleNews
|
| 27 |
from bs4 import BeautifulSoup
|
| 28 |
import requests
|
| 29 |
-
|
| 30 |
st.set_page_config(layout="wide")
|
| 31 |
|
| 32 |
GOOGLE_API_KEY=os.environ['GOOGLE_API_KEY']
|
|
@@ -424,6 +424,15 @@ elif activities=="News Sentiment":
|
|
| 424 |
|
| 425 |
"""
|
| 426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
def get_google_news_documents(query: str, max_articles: int = 10, timeout: int = 10):
|
| 428 |
"""
|
| 429 |
Fetches news articles from Google News and returns a list of LangChain Document objects,
|
|
@@ -444,8 +453,11 @@ elif activities=="News Sentiment":
|
|
| 444 |
|
| 445 |
documents = []
|
| 446 |
for article in articles[:max_articles]:
|
| 447 |
-
|
|
|
|
|
|
|
| 448 |
try:
|
|
|
|
| 449 |
response = requests.get(url, timeout=timeout, headers={
|
| 450 |
"User-Agent": "Mozilla/5.0"
|
| 451 |
})
|
|
|
|
| 26 |
from GoogleNews import GoogleNews
|
| 27 |
from bs4 import BeautifulSoup
|
| 28 |
import requests
|
| 29 |
+
from urllib.parse import urlparse, urlunparse
|
| 30 |
st.set_page_config(layout="wide")
|
| 31 |
|
| 32 |
GOOGLE_API_KEY=os.environ['GOOGLE_API_KEY']
|
|
|
|
| 424 |
|
| 425 |
"""
|
| 426 |
|
| 427 |
+
|
| 428 |
+
def clean_google_news_url(url: str) -> str:
    """
    Remove Google tracking parameters (&ved=..., &usg=..., etc.) from article URLs.

    GoogleNews appends tracking suffixes to the links it returns. For URLs whose
    real path ends in ``.html``, everything after ``.html`` is tracking junk and
    is dropped. For other URLs, known Google tracking markers are stripped
    directly (the original version left those URLs untouched, contradicting
    its own docstring).

    Parameters
    ----------
    url : str
        Article URL as returned by GoogleNews (may carry tracking suffixes).

    Returns
    -------
    str
        The URL with Google tracking suffixes removed; unchanged if none found.
    """
    if ".html" in url:
        # The article path ends at .html; anything after it is tracking junk.
        return url.split(".html")[0] + ".html"
    # No .html anchor: cut at the first known Google tracking marker.
    for marker in ("&ved=", "?ved=", "&usg=", "?usg="):
        if marker in url:
            url = url.split(marker, 1)[0]
    return url
|
| 436 |
def get_google_news_documents(query: str, max_articles: int = 10, timeout: int = 10):
|
| 437 |
"""
|
| 438 |
Fetches news articles from Google News and returns a list of LangChain Document objects,
|
|
|
|
| 453 |
|
| 454 |
documents = []
|
| 455 |
for article in articles[:max_articles]:
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
url = clean_google_news_url(article.get("link"))
|
| 459 |
try:
|
| 460 |
+
st.caption(f" Trying URL... {url}")
|
| 461 |
response = requests.get(url, timeout=timeout, headers={
|
| 462 |
"User-Agent": "Mozilla/5.0"
|
| 463 |
})
|