mobenta committed
Commit 1990058 · verified · 1 Parent(s): a60c7b3

Rename streamlit_app.py.txt to streamlit_app.py
streamlit_app.py.txt → streamlit_app.py RENAMED
@@ -1,317 +1,316 @@
- streamlit_app.py


import streamlit as st
from streamlit_tags import st_tags_sidebar
import pandas as pd
import json
from datetime import datetime
from scraper import fetch_html_selenium, save_raw_data, format_data, save_formatted_data, calculate_price, html_to_markdown_with_readability, create_dynamic_listing_model, create_listings_container_model, scrape_url
from pagination_detector import detect_pagination_elements, PaginationData
import re
from urllib.parse import urlparse
from assets import PRICING
import os
from pydantic import BaseModel


def serialize_pydantic(obj):
    if isinstance(obj, BaseModel):
        return obj.dict()
    raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')

# Initialize Streamlit app
st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑")
st.title("Universal Web Scraper 🦑")

# Initialize session state variables if they don't exist
if 'results' not in st.session_state:
    st.session_state['results'] = None
if 'perform_scrape' not in st.session_state:
    st.session_state['perform_scrape'] = False

# Sidebar components
st.sidebar.title("Web Scraper Settings")
model_selection = st.sidebar.selectbox("Select Model", options=list(PRICING.keys()), index=0)
url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace")

# Add toggle to show/hide tags field
show_tags = st.sidebar.toggle("Enable Scraping")

# Conditionally show tags input based on the toggle
tags = []
if show_tags:
    tags = st_tags_sidebar(
        label='Enter Fields to Extract:',
        text='Press enter to add a tag',
        value=[],
        suggestions=[],
        maxtags=-1,
        key='tags_input'
    )

st.sidebar.markdown("---")
# Add pagination toggle and input
use_pagination = st.sidebar.toggle("Enable Pagination")
pagination_details = None
if use_pagination:
    pagination_details = st.sidebar.text_input("Enter Pagination Details (optional)",
                                               help="Describe how to navigate through pages (e.g., 'Next' button class, URL pattern)")

st.sidebar.markdown("---")


def generate_unique_folder_name(url):
    timestamp = datetime.now().strftime('%Y_%m_%d__%H_%M_%S')

    # Parse the URL
    parsed_url = urlparse(url)

    # Extract the domain name
    domain = parsed_url.netloc or parsed_url.path.split('/')[0]

    # Remove 'www.' if present
    domain = re.sub(r'^www\.', '', domain)

    # Remove any non-alphanumeric characters and replace with underscores
    clean_domain = re.sub(r'\W+', '_', domain)

    return f"{clean_domain}_{timestamp}"

def scrape_multiple_urls(urls, fields, selected_model):
    output_folder = os.path.join('output', generate_unique_folder_name(urls[0]))
    os.makedirs(output_folder, exist_ok=True)

    total_input_tokens = 0
    total_output_tokens = 0
    total_cost = 0
    all_data = []
    first_url_markdown = None

    for i, url in enumerate(urls, start=1):
        raw_html = fetch_html_selenium(url)
        markdown = html_to_markdown_with_readability(raw_html)
        if i == 1:
            first_url_markdown = markdown

        input_tokens, output_tokens, cost, formatted_data = scrape_url(url, fields, selected_model, output_folder, i, markdown)
        total_input_tokens += input_tokens
        total_output_tokens += output_tokens
        total_cost += cost
        all_data.append(formatted_data)

    return output_folder, total_input_tokens, total_output_tokens, total_cost, all_data, first_url_markdown

# Define the scraping function
def perform_scrape():
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    raw_html = fetch_html_selenium(url_input)
    markdown = html_to_markdown_with_readability(raw_html)
    save_raw_data(markdown, timestamp)

    # Detect pagination if enabled
    pagination_info = None
    if use_pagination:
        pagination_data, token_counts, pagination_price = detect_pagination_elements(
            url_input, pagination_details, model_selection, markdown
        )
        pagination_info = {
            "page_urls": pagination_data.page_urls,
            "token_counts": token_counts,
            "price": pagination_price
        }

    # Initialize token and cost variables with default values
    input_tokens = 0
    output_tokens = 0
    total_cost = 0

    if show_tags:
        DynamicListingModel = create_dynamic_listing_model(tags)
        DynamicListingsContainer = create_listings_container_model(DynamicListingModel)
        formatted_data, tokens_count = format_data(
            markdown, DynamicListingsContainer, DynamicListingModel, model_selection
        )
        input_tokens, output_tokens, total_cost = calculate_price(tokens_count, model=model_selection)
        df = save_formatted_data(formatted_data, timestamp)
    else:
        formatted_data = None
        df = None

    return df, formatted_data, markdown, input_tokens, output_tokens, total_cost, timestamp, pagination_info

if st.sidebar.button("Scrape"):
    with st.spinner('Please wait... Data is being scraped.'):
        urls = url_input.split()
        field_list = tags
        output_folder, total_input_tokens, total_output_tokens, total_cost, all_data, first_url_markdown = scrape_multiple_urls(urls, field_list, model_selection)

        # Perform pagination if enabled and only one URL is provided
        pagination_info = None
        if use_pagination and len(urls) == 1:
            try:
                pagination_result = detect_pagination_elements(
                    urls[0], pagination_details, model_selection, first_url_markdown
                )

                if pagination_result is not None:
                    pagination_data, token_counts, pagination_price = pagination_result

                    # Handle both PaginationData objects and dictionaries
                    if isinstance(pagination_data, PaginationData):
                        page_urls = pagination_data.page_urls
                    elif isinstance(pagination_data, dict):
                        page_urls = pagination_data.get("page_urls", [])
                    else:
                        page_urls = []

                    pagination_info = {
                        "page_urls": page_urls,
                        "token_counts": token_counts,
                        "price": pagination_price
                    }
                else:
                    st.warning("Pagination detection returned None. No pagination information available.")
            except Exception as e:
                st.error(f"An error occurred during pagination detection: {e}")
                pagination_info = {
                    "page_urls": [],
                    "token_counts": {"input_tokens": 0, "output_tokens": 0},
                    "price": 0.0
                }

        st.session_state['results'] = (all_data, None, first_url_markdown, total_input_tokens, total_output_tokens, total_cost, output_folder, pagination_info)
        st.session_state['perform_scrape'] = True

# Display results if they exist in session state
if st.session_state['results']:
    all_data, _, _, input_tokens, output_tokens, total_cost, output_folder, pagination_info = st.session_state['results']

    # Display scraping details in sidebar only if scraping was performed and the toggle is on
    if all_data and show_tags:
        st.sidebar.markdown("---")
        st.sidebar.markdown("### Scraping Details")
        st.sidebar.markdown("#### Token Usage")
        st.sidebar.markdown(f"*Input Tokens:* {input_tokens}")
        st.sidebar.markdown(f"*Output Tokens:* {output_tokens}")
        st.sidebar.markdown(f"**Total Cost:** :green-background[**${total_cost:.4f}**]")

    # Display scraped data in main area
    st.subheader("Scraped/Parsed Data")
    for i, data in enumerate(all_data, start=1):
        st.write(f"Data from URL {i}:")

        # Handle string data (convert to dict if it's JSON)
        if isinstance(data, str):
            try:
                data = json.loads(data)
            except json.JSONDecodeError:
                st.error(f"Failed to parse data as JSON for URL {i}")
                continue

        if isinstance(data, dict):
            if 'listings' in data and isinstance(data['listings'], list):
                df = pd.DataFrame(data['listings'])
            else:
                # If 'listings' is not in the dict or not a list, use the entire dict
                df = pd.DataFrame([data])
        elif hasattr(data, 'listings') and isinstance(data.listings, list):
            # Handle the case where data is a Pydantic model
            listings = [item.dict() for item in data.listings]
            df = pd.DataFrame(listings)
        else:
            st.error(f"Unexpected data format for URL {i}")
            continue

        # Display the dataframe
        st.dataframe(df, use_container_width=True)

    # Download options
    st.subheader("Download Options")
    col1, col2 = st.columns(2)
    with col1:
        json_data = json.dumps(all_data, default=lambda o: o.dict() if hasattr(o, 'dict') else str(o), indent=4)
        st.download_button(
            "Download JSON",
            data=json_data,
            file_name="scraped_data.json"
        )
    with col2:
        # Convert all data to a single DataFrame
        all_listings = []
        for data in all_data:
            if isinstance(data, str):
                try:
                    data = json.loads(data)
                except json.JSONDecodeError:
                    continue
            if isinstance(data, dict) and 'listings' in data:
                all_listings.extend(data['listings'])
            elif hasattr(data, 'listings'):
                all_listings.extend([item.dict() for item in data.listings])
            else:
                all_listings.append(data)

        combined_df = pd.DataFrame(all_listings)
        st.download_button(
            "Download CSV",
            data=combined_df.to_csv(index=False),
            file_name="scraped_data.csv"
        )

    st.success(f"Scraping completed. Results saved in {output_folder}")

    # Add pagination details to sidebar
    if pagination_info and use_pagination:
        st.sidebar.markdown("---")
        st.sidebar.markdown("### Pagination Details")
        st.sidebar.markdown(f"**Number of Page URLs:** {len(pagination_info['page_urls'])}")
        st.sidebar.markdown("#### Pagination Token Usage")
        st.sidebar.markdown(f"*Input Tokens:* {pagination_info['token_counts']['input_tokens']}")
        st.sidebar.markdown(f"*Output Tokens:* {pagination_info['token_counts']['output_tokens']}")
        st.sidebar.markdown(f"**Pagination Cost:** :red-background[**${pagination_info['price']:.4f}**]")

        st.markdown("---")
        st.subheader("Pagination Information")
        pagination_df = pd.DataFrame(pagination_info["page_urls"], columns=["Page URLs"])

        st.dataframe(
            pagination_df,
            column_config={
                "Page URLs": st.column_config.LinkColumn("Page URLs")
            },
            use_container_width=True
        )

        # Create columns for download buttons
        col1, col2 = st.columns(2)
        with col1:
            st.download_button(
                "Download Pagination JSON",
                data=json.dumps(pagination_info["page_urls"], indent=4),
                file_name=f"pagination_urls.json"
            )
        with col2:
            st.download_button(
                "Download Pagination CSV",
                data=pagination_df.to_csv(index=False),
                file_name=f"pagination_urls.csv"
            )

    # Display combined totals only if both scraping and pagination were performed and both toggles are on
    if all_data and pagination_info and show_tags and use_pagination:
        st.markdown("---")
        total_input_tokens = input_tokens + pagination_info['token_counts']['input_tokens']
        total_output_tokens = output_tokens + pagination_info['token_counts']['output_tokens']
        total_combined_cost = total_cost + pagination_info['price']
        st.markdown("### Total Counts and Cost (Including Pagination)")
        st.markdown(f"**Total Input Tokens:** {total_input_tokens}")
        st.markdown(f"**Total Output Tokens:** {total_output_tokens}")
        st.markdown(f"**Total Combined Cost:** :green[**${total_combined_cost:.4f}**]")

# Add a clear results button
if st.sidebar.button("Clear Results"):
    st.session_state['results'] = None
    st.session_state['perform_scrape'] = False
    st.rerun()