mobenta committed
Commit df71548 · verified · 1 Parent(s): 3e30f65

Delete streamlit_app.py

Files changed (1)
  1. streamlit_app.py +0 -316
streamlit_app.py DELETED
@@ -1,316 +0,0 @@
-
-
-import streamlit as st
-from streamlit_tags import st_tags_sidebar
-import pandas as pd
-import json
-from datetime import datetime
-from scraper import fetch_html_selenium, save_raw_data, format_data, save_formatted_data, calculate_price, html_to_markdown_with_readability, create_dynamic_listing_model, create_listings_container_model, scrape_url
-from pagination_detector import detect_pagination_elements, PaginationData
-import re
-from urllib.parse import urlparse
-from assets import PRICING
-import os
-from pydantic import BaseModel
-
-
-def serialize_pydantic(obj):
-    if isinstance(obj, BaseModel):
-        return obj.dict()
-    raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')
-
-# Initialize Streamlit app
-st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑")
-st.title("Universal Web Scraper 🦑")
-
-# Initialize session state variables if they don't exist
-if 'results' not in st.session_state:
-    st.session_state['results'] = None
-if 'perform_scrape' not in st.session_state:
-    st.session_state['perform_scrape'] = False
-
-# Sidebar components
-st.sidebar.title("Web Scraper Settings")
-model_selection = st.sidebar.selectbox("Select Model", options=list(PRICING.keys()), index=0)
-url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace")
-
-# Add toggle to show/hide tags field
-show_tags = st.sidebar.toggle("Enable Scraping")
-
-# Conditionally show tags input based on the toggle
-tags = []
-if show_tags:
-    tags = st_tags_sidebar(
-        label='Enter Fields to Extract:',
-        text='Press enter to add a tag',
-        value=[],
-        suggestions=[],
-        maxtags=-1,
-        key='tags_input'
-    )
-
-st.sidebar.markdown("---")
-# Add pagination toggle and input
-use_pagination = st.sidebar.toggle("Enable Pagination")
-pagination_details = None
-if use_pagination:
-    pagination_details = st.sidebar.text_input("Enter Pagination Details (optional)",
-                                               help="Describe how to navigate through pages (e.g., 'Next' button class, URL pattern)")
-
-st.sidebar.markdown("---")
-
-
-def generate_unique_folder_name(url):
-    timestamp = datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
-
-    # Parse the URL
-    parsed_url = urlparse(url)
-
-    # Extract the domain name
-    domain = parsed_url.netloc or parsed_url.path.split('/')[0]
-
-    # Remove 'www.' if present
-    domain = re.sub(r'^www\.', '', domain)
-
-    # Remove any non-alphanumeric characters and replace with underscores
-    clean_domain = re.sub(r'\W+', '_', domain)
-
-    return f"{clean_domain}_{timestamp}"
-
-def scrape_multiple_urls(urls, fields, selected_model):
-    output_folder = os.path.join('output', generate_unique_folder_name(urls[0]))
-    os.makedirs(output_folder, exist_ok=True)
-
-    total_input_tokens = 0
-    total_output_tokens = 0
-    total_cost = 0
-    all_data = []
-    first_url_markdown = None
-
-    for i, url in enumerate(urls, start=1):
-        raw_html = fetch_html_selenium(url)
-        markdown = html_to_markdown_with_readability(raw_html)
-        if i == 1:
-            first_url_markdown = markdown
-
-        input_tokens, output_tokens, cost, formatted_data = scrape_url(url, fields, selected_model, output_folder, i, markdown)
-        total_input_tokens += input_tokens
-        total_output_tokens += output_tokens
-        total_cost += cost
-        all_data.append(formatted_data)
-
-    return output_folder, total_input_tokens, total_output_tokens, total_cost, all_data, first_url_markdown
-
-# Define the scraping function
-def perform_scrape():
-    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-    raw_html = fetch_html_selenium(url_input)
-    markdown = html_to_markdown_with_readability(raw_html)
-    save_raw_data(markdown, timestamp)
-
-    # Detect pagination if enabled
-    pagination_info = None
-    if use_pagination:
-        pagination_data, token_counts, pagination_price = detect_pagination_elements(
-            url_input, pagination_details, model_selection, markdown
-        )
-        pagination_info = {
-            "page_urls": pagination_data.page_urls,
-            "token_counts": token_counts,
-            "price": pagination_price
-        }
-
-    # Initialize token and cost variables with default values
-    input_tokens = 0
-    output_tokens = 0
-    total_cost = 0
-
-    if show_tags:
-        DynamicListingModel = create_dynamic_listing_model(tags)
-        DynamicListingsContainer = create_listings_container_model(DynamicListingModel)
-        formatted_data, tokens_count = format_data(
-            markdown, DynamicListingsContainer, DynamicListingModel, model_selection
-        )
-        input_tokens, output_tokens, total_cost = calculate_price(tokens_count, model=model_selection)
-        df = save_formatted_data(formatted_data, timestamp)
-    else:
-        formatted_data = None
-        df = None
-
-    return df, formatted_data, markdown, input_tokens, output_tokens, total_cost, timestamp, pagination_info
-
-if st.sidebar.button("Scrape"):
-    with st.spinner('Please wait... Data is being scraped.'):
-        urls = url_input.split()
-        field_list = tags
-        output_folder, total_input_tokens, total_output_tokens, total_cost, all_data, first_url_markdown = scrape_multiple_urls(urls, field_list, model_selection)
-
-        # Perform pagination if enabled and only one URL is provided
-        pagination_info = None
-        if use_pagination and len(urls) == 1:
-            try:
-                pagination_result = detect_pagination_elements(
-                    urls[0], pagination_details, model_selection, first_url_markdown
-                )
-
-                if pagination_result is not None:
-                    pagination_data, token_counts, pagination_price = pagination_result
-
-                    # Handle both PaginationData objects and dictionaries
-                    if isinstance(pagination_data, PaginationData):
-                        page_urls = pagination_data.page_urls
-                    elif isinstance(pagination_data, dict):
-                        page_urls = pagination_data.get("page_urls", [])
-                    else:
-                        page_urls = []
-
-                    pagination_info = {
-                        "page_urls": page_urls,
-                        "token_counts": token_counts,
-                        "price": pagination_price
-                    }
-                else:
-                    st.warning("Pagination detection returned None. No pagination information available.")
-            except Exception as e:
-                st.error(f"An error occurred during pagination detection: {e}")
-                pagination_info = {
-                    "page_urls": [],
-                    "token_counts": {"input_tokens": 0, "output_tokens": 0},
-                    "price": 0.0
-                }
-
-        st.session_state['results'] = (all_data, None, first_url_markdown, total_input_tokens, total_output_tokens, total_cost, output_folder, pagination_info)
-        st.session_state['perform_scrape'] = True
-
-# Display results if they exist in session state
-if st.session_state['results']:
-    all_data, _, _, input_tokens, output_tokens, total_cost, output_folder, pagination_info = st.session_state['results']
-
-    # Display scraping details in sidebar only if scraping was performed and the toggle is on
-    if all_data and show_tags:
-        st.sidebar.markdown("---")
-        st.sidebar.markdown("### Scraping Details")
-        st.sidebar.markdown("#### Token Usage")
-        st.sidebar.markdown(f"*Input Tokens:* {input_tokens}")
-        st.sidebar.markdown(f"*Output Tokens:* {output_tokens}")
-        st.sidebar.markdown(f"**Total Cost:** :green-background[**${total_cost:.4f}**]")
-
-        # Display scraped data in main area
-        st.subheader("Scraped/Parsed Data")
-        for i, data in enumerate(all_data, start=1):
-            st.write(f"Data from URL {i}:")
-
-            # Handle string data (convert to dict if it's JSON)
-            if isinstance(data, str):
-                try:
-                    data = json.loads(data)
-                except json.JSONDecodeError:
-                    st.error(f"Failed to parse data as JSON for URL {i}")
-                    continue
-
-            if isinstance(data, dict):
-                if 'listings' in data and isinstance(data['listings'], list):
-                    df = pd.DataFrame(data['listings'])
-                else:
-                    # If 'listings' is not in the dict or not a list, use the entire dict
-                    df = pd.DataFrame([data])
-            elif hasattr(data, 'listings') and isinstance(data.listings, list):
-                # Handle the case where data is a Pydantic model
-                listings = [item.dict() for item in data.listings]
-                df = pd.DataFrame(listings)
-            else:
-                st.error(f"Unexpected data format for URL {i}")
-                continue
-
-            # Display the dataframe
-            st.dataframe(df, use_container_width=True)
-
-        # Download options
-        st.subheader("Download Options")
-        col1, col2 = st.columns(2)
-        with col1:
-            json_data = json.dumps(all_data, default=lambda o: o.dict() if hasattr(o, 'dict') else str(o), indent=4)
-            st.download_button(
-                "Download JSON",
-                data=json_data,
-                file_name="scraped_data.json"
-            )
-        with col2:
-            # Convert all data to a single DataFrame
-            all_listings = []
-            for data in all_data:
-                if isinstance(data, str):
-                    try:
-                        data = json.loads(data)
-                    except json.JSONDecodeError:
-                        continue
-                if isinstance(data, dict) and 'listings' in data:
-                    all_listings.extend(data['listings'])
-                elif hasattr(data, 'listings'):
-                    all_listings.extend([item.dict() for item in data.listings])
-                else:
-                    all_listings.append(data)
-
-            combined_df = pd.DataFrame(all_listings)
-            st.download_button(
-                "Download CSV",
-                data=combined_df.to_csv(index=False),
-                file_name="scraped_data.csv"
-            )
-
-        st.success(f"Scraping completed. Results saved in {output_folder}")
-
-    # Add pagination details to sidebar
-    if pagination_info and use_pagination:
-        st.sidebar.markdown("---")
-        st.sidebar.markdown("### Pagination Details")
-        st.sidebar.markdown(f"**Number of Page URLs:** {len(pagination_info['page_urls'])}")
-        st.sidebar.markdown("#### Pagination Token Usage")
-        st.sidebar.markdown(f"*Input Tokens:* {pagination_info['token_counts']['input_tokens']}")
-        st.sidebar.markdown(f"*Output Tokens:* {pagination_info['token_counts']['output_tokens']}")
-        st.sidebar.markdown(f"**Pagination Cost:** :red-background[**${pagination_info['price']:.4f}**]")
-
-        st.markdown("---")
-        st.subheader("Pagination Information")
-        pagination_df = pd.DataFrame(pagination_info["page_urls"], columns=["Page URLs"])
-
-        st.dataframe(
-            pagination_df,
-            column_config={
-                "Page URLs": st.column_config.LinkColumn("Page URLs")
-            }, use_container_width=True
-        )
-
-        # Create columns for download buttons
-        col1, col2 = st.columns(2)
-        with col1:
-            st.download_button(
-                "Download Pagination JSON",
-                data=json.dumps(pagination_info["page_urls"], indent=4),
-                file_name=f"pagination_urls.json"
-            )
-        with col2:
-            st.download_button(
-                "Download Pagination CSV",
-                data=pagination_df.to_csv(index=False),
-                file_name=f"pagination_urls.csv"
-            )
-
-    # Display combined totals only if both scraping and pagination were performed and both toggles are on
-    if all_data and pagination_info and show_tags and use_pagination:
-        st.markdown("---")
-        total_input_tokens = input_tokens + pagination_info['token_counts']['input_tokens']
-        total_output_tokens = output_tokens + pagination_info['token_counts']['output_tokens']
-        total_combined_cost = total_cost + pagination_info['price']
-        st.markdown("### Total Counts and Cost (Including Pagination)")
-        st.markdown(f"**Total Input Tokens:** {total_input_tokens}")
-        st.markdown(f"**Total Output Tokens:** {total_output_tokens}")
-        st.markdown(f"**Total Combined Cost:** :green[**${total_combined_cost:.4f}**]")
-
-# Add a clear results button
-if st.sidebar.button("Clear Results"):
-    st.session_state['results'] = None
-    st.session_state['perform_scrape'] = False
-    st.rerun()
-
-