rajat5ranjan commited on
Commit
92eaa07
·
verified ·
1 Parent(s): dd363c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -70
app.py CHANGED
@@ -45,81 +45,81 @@ llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro",google_api_key = GOOGLE_API_
45
 
46
  activities = st.sidebar.selectbox("Select", ["Symbol Analysis", "News Sentiment"])
47
 
48
- def clean_google_news_url(url: str):
49
- """
50
- Cleans Google News redirect URLs by removing tracking parameters like &ved= and &usg=.
51
- Keeps content up to .html or .cms.
52
- """
53
- for ext in [".html", ".cms"]:
54
- if ext in url:
55
- return url.split(ext)[0] + ext
56
- return url.split("&")[0] # fallback
57
- def get_google_news_documents(query: str, max_articles: int = 10, timeout: int = 10):
58
- """
59
- Fetches news articles from Google News and returns a list of LangChain Document objects,
60
- using requests + BeautifulSoup instead of newspaper3k.
61
-
62
- Args:
63
- query (str): Search query for Google News.
64
- max_articles (int): Number of articles to fetch.
65
- timeout (int): Timeout for HTTP requests.
 
 
 
 
 
66
 
67
- Returns:
68
- List[Document]: Parsed article content as LangChain Document objects.
69
- """
70
- st.caption(f"Fetching articles for query: '{query}'")
71
-
72
- googlenews = GoogleNews(lang="en")
73
- # Set time range to last `days` days
74
- end_date = datetime.today()
75
- days = 2
76
- start_date = end_date - timedelta(days=days)
77
- googlenews.set_time_range(start_date.strftime("%m/%d/%Y"), end_date.strftime("%m/%d/%Y"))
78
 
79
- googlenews.search(query)
80
- articles = googlenews.result()
 
 
 
 
81
 
82
- documents = []
83
- i=1
84
- for article in articles:
85
 
86
-
87
- url = clean_google_news_url(article.get("link"))
88
- try:
89
- with st.spinner(f" Trying URL... {url}"):
90
- # st.caption()
91
- response = requests.get(url, timeout=timeout, headers={
92
- "User-Agent": "Mozilla/5.0"
93
- })
94
- response.raise_for_status()
95
- soup = BeautifulSoup(response.text, "html.parser")
96
-
97
- # Extract visible <p> tags to simulate main content
98
- paragraphs = soup.find_all("p")
99
- content = "\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
100
-
101
- if content and len(content) > 200: # crude filter to skip empty or useless pages
102
- doc = Document(
103
- page_content=content,
104
- metadata={
105
- "source": "Google News",
106
- "title": article.get("title", ""),
107
- "published": article.get("date", ""),
108
- "link": url,
109
- }
110
- )
111
- documents.append(doc)
112
-
113
- if i > max_articles:
114
- st.caption("max articles reached...")
115
- break
116
-
117
- i+=1
118
- except Exception as e:
119
- # st.error(f"Failed to fetch or parse article: {url} — Error: {e}")
120
- pass
121
 
122
- return documents
 
 
 
 
 
123
 
124
  if activities == "Symbol Analysis":
125
  ticker_user = st.text_input("Enter Ticker for NSE Stocks","")
 
45
 
46
  activities = st.sidebar.selectbox("Select", ["Symbol Analysis", "News Sentiment"])
47
 
48
def clean_google_news_url(url: str) -> str:
    """Strip Google News tracking junk from a redirect URL.

    Google News result links often append tracking parameters (e.g.
    ``&ved=``, ``&usg=``) after the real article URL.  For article pages
    ending in ``.html`` or ``.cms`` everything after that extension is
    dropped; otherwise everything after the first ``&`` is dropped as a
    fallback.

    Args:
        url: Raw (possibly tracked) URL.  May be ``None``/empty, since the
            caller passes ``article.get("link")``.

    Returns:
        The cleaned URL, or ``""`` when *url* is falsy.
    """
    if not url:
        # article.get("link") can be None; without this guard the
        # `ext in url` test below raises TypeError before the caller's try.
        return ""
    for ext in (".html", ".cms"):
        if ext in url:
            return url.split(ext)[0] + ext
    return url.split("&")[0]  # fallback: drop trailing tracking params
57
def get_google_news_documents(query: str, max_articles: int = 10, timeout: int = 10):
    """
    Fetches news articles from Google News and returns a list of LangChain
    Document objects, using requests + BeautifulSoup instead of newspaper3k.

    Searches Google News for *query* restricted to the last 2 days, downloads
    each result page, and keeps pages whose visible <p> text looks substantial.

    Args:
        query (str): Search query for Google News.
        max_articles (int): Maximum number of documents to collect.
        timeout (int): Timeout (seconds) for each HTTP request.

    Returns:
        List[Document]: Parsed article content as LangChain Document objects,
        at most *max_articles* entries.
    """
    st.caption(f"Fetching articles for query: '{query}'")

    googlenews = GoogleNews(lang="en")
    # Restrict results to the last `days` days.
    end_date = datetime.today()
    days = 2
    start_date = end_date - timedelta(days=days)
    googlenews.set_time_range(start_date.strftime("%m/%d/%Y"), end_date.strftime("%m/%d/%Y"))

    googlenews.search(query)
    articles = googlenews.result()

    documents = []
    for article in articles:
        # Cap on *collected* documents rather than a separate counter:
        # the original `i > max_articles` check ran after appending and
        # could overshoot the limit by one, and failed fetches also
        # consumed quota.
        if len(documents) >= max_articles:
            st.caption("max articles reached...")
            break

        raw_link = article.get("link")
        try:
            # Inside the try so a missing/None link skips this article
            # instead of crashing the whole loop.
            url = clean_google_news_url(raw_link)
            with st.spinner(f" Trying URL... {url}"):
                response = requests.get(
                    url,
                    timeout=timeout,
                    headers={"User-Agent": "Mozilla/5.0"},
                )
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")

                # Extract visible <p> tags to approximate the main content.
                paragraphs = soup.find_all("p")
                content = "\n".join(
                    p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)
                )

                # Crude filter to skip empty or useless pages.
                if content and len(content) > 200:
                    documents.append(
                        Document(
                            page_content=content,
                            metadata={
                                "source": "Google News",
                                "title": article.get("title", ""),
                                "published": article.get("date", ""),
                                "link": url,
                            },
                        )
                    )
        except Exception as e:
            # Best-effort scrape: surface the failure instead of silently
            # swallowing it, then move on to the next article.
            st.caption(f"Skipping article ({raw_link}): {e}")

    return documents
123
 
124
  if activities == "Symbol Analysis":
125
  ticker_user = st.text_input("Enter Ticker for NSE Stocks","")