mobenta committed
Commit a60c7b3 · verified · 1 parent: c04f6ef

Rename scraper.py.txt to scraper.py

Files changed (1):
  scraper.py.txt → scraper.py (renamed): +457 -457
Contents of scraper.py after the rename (the only content change in this commit is that the stray "scraper.py" text on the old file's first line was dropped; the remaining lines are identical in both versions):

import os
import random
import time
import re
import json
from datetime import datetime
from typing import List, Dict, Type

import pandas as pd
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field, create_model
import html2text
import tiktoken

from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


from openai import OpenAI
import google.generativeai as genai
from groq import Groq


from assets import USER_AGENTS, PRICING, HEADLESS_OPTIONS, SYSTEM_MESSAGE, USER_MESSAGE, LLAMA_MODEL_FULLNAME, GROQ_LLAMA_MODEL_FULLNAME
load_dotenv()

# Set up the Chrome WebDriver options

def setup_selenium():
    options = Options()

    # Randomly select a user agent from the imported list
    user_agent = random.choice(USER_AGENTS)
    options.add_argument(f"user-agent={user_agent}")

    # Add other options
    for option in HEADLESS_OPTIONS:
        options.add_argument(option)

    # Specify the path to the ChromeDriver
    service = Service(r"./chromedriver-win64/chromedriver.exe")

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def click_accept_cookies(driver):
    """
    Tries to find and click on a cookie consent button. It looks for several common patterns.
    """
    try:
        # Wait for the cookie popup to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button | //a | //div"))
        )

        # Common text variations for cookie buttons
        accept_text_variations = [
            "accept", "agree", "allow", "consent", "continue", "ok", "I agree", "got it"
        ]

        # Iterate through different element types and common text variations
        for tag in ["button", "a", "div"]:
            for text in accept_text_variations:
                try:
                    # Create an XPath to find the button by text
                    element = driver.find_element(By.XPATH, f"//{tag}[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text}')]")
                    if element:
                        element.click()
                        print(f"Clicked the '{text}' button.")
                        return
                except Exception:
                    continue

        print("No 'Accept Cookies' button found.")

    except Exception as e:
        print(f"Error finding 'Accept Cookies' button: {e}")

def fetch_html_selenium(url):
    driver = setup_selenium()
    try:
        driver.get(url)

        # Add random delays to mimic human behavior
        time.sleep(1)  # Adjust this to simulate time for the user to read or interact
        driver.maximize_window()

        # Try to find and click the 'Accept Cookies' button
        # click_accept_cookies(driver)

        # Add more realistic actions like scrolling
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        time.sleep(random.uniform(1.1, 1.8))  # Simulate time taken to scroll and read
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/1.2);")
        time.sleep(random.uniform(1.1, 1.8))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/1);")
        time.sleep(random.uniform(1.1, 2.1))
        html = driver.page_source
        return html
    finally:
        driver.quit()

def clean_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove headers and footers based on common HTML tags or classes
    for element in soup.find_all(['header', 'footer']):
        element.decompose()  # Remove these tags and their content

    return str(soup)


def html_to_markdown_with_readability(html_content):
    cleaned_html = clean_html(html_content)

    # Convert to markdown
    markdown_converter = html2text.HTML2Text()
    markdown_converter.ignore_links = False
    markdown_content = markdown_converter.handle(cleaned_html)

    return markdown_content

def save_raw_data(raw_data: str, output_folder: str, file_name: str):
    """Save raw markdown data to the specified output folder."""
    os.makedirs(output_folder, exist_ok=True)
    raw_output_path = os.path.join(output_folder, file_name)
    with open(raw_output_path, 'w', encoding='utf-8') as f:
        f.write(raw_data)
    print(f"Raw data saved to {raw_output_path}")
    return raw_output_path


def remove_urls_from_file(file_path):
    # Regex pattern to find URLs
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Construct the new file name
    base, ext = os.path.splitext(file_path)
    new_file_path = f"{base}_cleaned{ext}"

    # Read the original markdown content
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_content = file.read()

    # Replace all found URLs with an empty string
    cleaned_content = re.sub(url_pattern, '', markdown_content)

    # Write the cleaned content to a new file
    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
    print(f"Cleaned file saved as: {new_file_path}")
    return cleaned_content

def create_dynamic_listing_model(field_names: List[str]) -> Type[BaseModel]:
    """
    Dynamically creates a Pydantic model based on the provided fields.
    field_names is a list of names of the fields to extract from the markdown.
    """
    # Create field definitions (each field is a required string)
    field_definitions = {field: (str, ...) for field in field_names}
    # Dynamically create the model with all fields
    return create_model('DynamicListingModel', **field_definitions)


def create_listings_container_model(listing_model: Type[BaseModel]) -> Type[BaseModel]:
    """
    Create a container model that holds a list of the given listing model.
    """
    return create_model('DynamicListingsContainer', listings=(List[listing_model], ...))
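
# Illustrative note (the field names below are hypothetical, not part of the original file):
# create_dynamic_listing_model(["title", "price"]) yields a model with required string
# fields "title" and "price"; create_listings_container_model(...) wraps it so the expected
# LLM output has the shape {"listings": [{"title": "...", "price": "..."}, ...]}.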

def trim_to_token_limit(text, model, max_tokens=120000):
    encoder = tiktoken.encoding_for_model(model)
    tokens = encoder.encode(text)
    if len(tokens) > max_tokens:
        trimmed_text = encoder.decode(tokens[:max_tokens])
        return trimmed_text
    return text

def generate_system_message(listing_model: BaseModel) -> str:
    """
    Dynamically generate a system message based on the fields in the provided listing model.
    """
    # Use the model_json_schema() method to introspect the Pydantic model
    schema_info = listing_model.model_json_schema()

    # Extract field descriptions from the schema
    field_descriptions = []
    for field_name, field_info in schema_info["properties"].items():
        # Get the field type from the schema info
        field_type = field_info["type"]
        field_descriptions.append(f'"{field_name}": "{field_type}"')

    # Create the JSON schema structure for the listings
    schema_structure = ",\n".join(field_descriptions)

    # Generate the system message dynamically
    system_message = f"""
    You are an intelligent text extraction and conversion assistant. Your task is to extract structured information
    from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text,
    with no additional commentary, explanations, or extraneous information.
    You could encounter cases where you can't find the data of the fields you have to extract or the data will be in a foreign language.
    Please process the following text and provide the output in pure JSON format with no words before or after the JSON:
    Please ensure the output strictly follows this schema:

    {{
        "listings": [
            {{
                {schema_structure}
            }}
        ]
    }} """

    return system_message

def format_data(data, DynamicListingsContainer, DynamicListingModel, selected_model):
    """Send the markdown to the selected LLM backend and return (parsed_output, token_counts)."""
    token_counts = {}

    if selected_model in ["gpt-4o-mini", "gpt-4o-2024-08-06"]:
        # Use the OpenAI API with structured (parsed) output
        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        completion = client.beta.chat.completions.parse(
            model=selected_model,
            messages=[
                {"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": USER_MESSAGE + data},
            ],
            response_format=DynamicListingsContainer
        )
        # Calculate tokens using tiktoken
        encoder = tiktoken.encoding_for_model(selected_model)
        input_token_count = len(encoder.encode(USER_MESSAGE + data))
        output_token_count = len(encoder.encode(json.dumps(completion.choices[0].message.parsed.dict())))
        token_counts = {
            "input_tokens": input_token_count,
            "output_tokens": output_token_count
        }
        return completion.choices[0].message.parsed, token_counts

    elif selected_model == "gemini-1.5-flash":
        # Use the Google Gemini API
        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
        model = genai.GenerativeModel('gemini-1.5-flash',
                                      generation_config={
                                          "response_mime_type": "application/json",
                                          "response_schema": DynamicListingsContainer
                                      })
        prompt = SYSTEM_MESSAGE + "\n" + USER_MESSAGE + data
        # Count input tokens using Gemini's method
        input_tokens = model.count_tokens(prompt)
        completion = model.generate_content(prompt)
        # Extract token counts from usage_metadata
        usage_metadata = completion.usage_metadata
        token_counts = {
            "input_tokens": usage_metadata.prompt_token_count,
            "output_tokens": usage_metadata.candidates_token_count
        }
        return completion.text, token_counts

    elif selected_model == "Llama3.1 8B":
        # Dynamically generate the system message based on the schema
        sys_message = generate_system_message(DynamicListingModel)
        # print(SYSTEM_MESSAGE)

        # Point to the local server
        client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

        completion = client.chat.completions.create(
            model=LLAMA_MODEL_FULLNAME,  # change this if needed (use a better model)
            messages=[
                {"role": "system", "content": sys_message},
                {"role": "user", "content": USER_MESSAGE + data}
            ],
            temperature=0.7,
        )

        # Extract the content from the response
        response_content = completion.choices[0].message.content
        print(response_content)
        # Convert the content from a JSON string to a Python dictionary
        parsed_response = json.loads(response_content)

        # Extract token usage
        token_counts = {
            "input_tokens": completion.usage.prompt_tokens,
            "output_tokens": completion.usage.completion_tokens
        }

        return parsed_response, token_counts

    elif selected_model == "Groq Llama3.1 70b":
        # Dynamically generate the system message based on the schema
        sys_message = generate_system_message(DynamicListingModel)
        # print(SYSTEM_MESSAGE)

        # Use the hosted Groq API
        client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": sys_message},
                {"role": "user", "content": USER_MESSAGE + data}
            ],
            model=GROQ_LLAMA_MODEL_FULLNAME,
        )

        # Extract the content from the response
        response_content = completion.choices[0].message.content

        # Convert the content from a JSON string to a Python dictionary
        parsed_response = json.loads(response_content)

        # Token usage reported by the API (completion.usage)
        token_counts = {
            "input_tokens": completion.usage.prompt_tokens,
            "output_tokens": completion.usage.completion_tokens
        }

        return parsed_response, token_counts

    else:
        raise ValueError(f"Unsupported model: {selected_model}")

def save_formatted_data(formatted_data, output_folder: str, json_file_name: str, excel_file_name: str):
    """Save formatted data as JSON and Excel in the specified output folder."""
    os.makedirs(output_folder, exist_ok=True)

    # Parse the formatted data if it's a JSON string (from the Gemini API)
    if isinstance(formatted_data, str):
        try:
            formatted_data_dict = json.loads(formatted_data)
        except json.JSONDecodeError:
            raise ValueError("The provided formatted data is a string but not valid JSON.")
    else:
        # Handle data from OpenAI or other sources
        formatted_data_dict = formatted_data.dict() if hasattr(formatted_data, 'dict') else formatted_data

    # Save the formatted data as JSON
    json_output_path = os.path.join(output_folder, json_file_name)
    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(formatted_data_dict, f, indent=4)
    print(f"Formatted data saved to JSON at {json_output_path}")

    # Prepare data for the DataFrame
    if isinstance(formatted_data_dict, dict):
        # If the data is a dictionary with a single entry, assume that entry holds the list of records
        data_for_df = next(iter(formatted_data_dict.values())) if len(formatted_data_dict) == 1 else formatted_data_dict
    elif isinstance(formatted_data_dict, list):
        data_for_df = formatted_data_dict
    else:
        raise ValueError("Formatted data is neither a dictionary nor a list, cannot convert to DataFrame")

    # Create the DataFrame
    try:
        df = pd.DataFrame(data_for_df)
        print("DataFrame created successfully.")

        # Save the DataFrame to an Excel file
        excel_output_path = os.path.join(output_folder, excel_file_name)
        df.to_excel(excel_output_path, index=False)
        print(f"Formatted data saved to Excel at {excel_output_path}")

        return df
    except Exception as e:
        print(f"Error creating DataFrame or saving Excel: {str(e)}")
        return None

def calculate_price(token_counts, model):
    input_token_count = token_counts.get("input_tokens", 0)
    output_token_count = token_counts.get("output_tokens", 0)

    # Calculate the costs
    input_cost = input_token_count * PRICING[model]["input"]
    output_cost = output_token_count * PRICING[model]["output"]
    total_cost = input_cost + output_cost

    return input_token_count, output_token_count, total_cost


def generate_unique_folder_name(url):
    timestamp = datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
    url_name = re.sub(r'\W+', '_', url.split('//')[1].split('/')[0])  # Extract domain name and replace non-alphanumeric characters
    return f"{url_name}_{timestamp}"

def scrape_multiple_urls(urls, fields, selected_model):
    output_folder = os.path.join('output', generate_unique_folder_name(urls[0]))
    os.makedirs(output_folder, exist_ok=True)

    total_input_tokens = 0
    total_output_tokens = 0
    total_cost = 0
    all_data = []
    markdown = None  # We'll store the markdown for the first (or only) URL

    for i, url in enumerate(urls, start=1):
        raw_html = fetch_html_selenium(url)
        current_markdown = html_to_markdown_with_readability(raw_html)
        if i == 1:
            markdown = current_markdown  # Store markdown for the first URL

        input_tokens, output_tokens, cost, formatted_data = scrape_url(url, fields, selected_model, output_folder, i, current_markdown)
        total_input_tokens += input_tokens
        total_output_tokens += output_tokens
        total_cost += cost
        all_data.append(formatted_data)

    return output_folder, total_input_tokens, total_output_tokens, total_cost, all_data, markdown

def scrape_url(url: str, fields: List[str], selected_model: str, output_folder: str, file_number: int, markdown: str):
    """Scrape a single URL and save the results."""
    try:
        # Save raw data
        save_raw_data(markdown, output_folder, f'rawData_{file_number}.md')

        # Create the dynamic listing model
        DynamicListingModel = create_dynamic_listing_model(fields)

        # Create the container model that holds a list of the dynamic listing models
        DynamicListingsContainer = create_listings_container_model(DynamicListingModel)

        # Format data
        formatted_data, token_counts = format_data(markdown, DynamicListingsContainer, DynamicListingModel, selected_model)

        # Save formatted data
        save_formatted_data(formatted_data, output_folder, f'sorted_data_{file_number}.json', f'sorted_data_{file_number}.xlsx')

        # Calculate and return token usage and cost
        input_tokens, output_tokens, total_cost = calculate_price(token_counts, selected_model)
        return input_tokens, output_tokens, total_cost, formatted_data

    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")
        return 0, 0, 0, None
 
 
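For orientation, a minimal usage sketch of the pipeline this module exposes; the URL and field names below are placeholders, and it assumes the assets module, the relevant API keys, and the local chromedriver are configured as the code above expects:

# Illustrative driver script; placeholder inputs, not part of the commit.
from scraper import scrape_multiple_urls

urls = ["https://example.com/listings"]        # placeholder URL
fields = ["title", "price", "location"]        # placeholder fields to extract
folder, in_tok, out_tok, cost, data, md = scrape_multiple_urls(urls, fields, "gpt-4o-mini")
print(f"Saved to {folder}: {in_tok} input / {out_tok} output tokens, ${cost:.4f}")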