In [9]:
from fake_headers import Headers

headers = Headers(headers=True).generate()
headers

{'Accept': '*/*',
 'Connection': 'keep-alive',
 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6; rv:61.0) Gecko/20100101 Firefox/61.0',
 'Cache-Control': 'max-age=0',
 'Upgrade-Insecure-Requests': '1',
 'Referer': 'https://google.com'}

In [10]:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time


def scroll_and_wait(driver, scroll_pause_time=2):
 """
 Scroll the page gradually and wait for images to load
 """
 # Get scroll height
 last_height = driver.execute_script("return document.body.scrollHeight")

 while True:
 # Scroll down gradually
 for i in range(10):
 driver.execute_script(f"window.scrollTo(0, {(i + 1) * (last_height / 10)});")
 time.sleep(0.5) # Short pause between each scroll step

 # Wait for new images to load
 time.sleep(scroll_pause_time)

 # Calculate new scroll height and compare with last scroll height
 new_height = driver.execute_script("return document.body.scrollHeight")
 if new_height == last_height:
 break
 last_height = new_height


def wait_for_images(driver, timeout=10):
 """
 Wait for images to load and become visible
 """
 try:
 # Wait for all image elements to be present
 WebDriverWait(driver, timeout).until(
 EC.presence_of_all_elements_located((By.TAG_NAME, "img"))
 )

 # Get all image elements
 images = driver.find_elements(By.TAG_NAME, "img")

 # Wait for images to load
 for img in images:
 try:
 WebDriverWait(driver, 2).until(
 lambda d: img.get_attribute('complete') == 'true' and
 img.get_attribute('naturalHeight') != '0'
 )
 except:
 continue # Skip images that don't load within timeout

 except Exception as e:
 print(f"Warning: Not all images could be loaded: {e}")

In [11]:
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Add fake headers
for key, value in headers.items():
 chrome_options.add_argument(f'--{key.lower()}={value}')

# Additional configurations to appear more human-like
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--window-size=1920,1080")

# Enable images in headless mode
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--high-dpi-support=1")

# Privacy and fingerprinting prevention
chrome_options.add_argument("--disable-blink-features")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Enable JavaScript
chrome_options.add_argument("--enable-javascript")

driver = webdriver.Chrome(options=chrome_options)

driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
 "source": """
 Object.defineProperty(navigator, 'webdriver', {
 get: () => undefined
 })
 """
})

products_url = "https://www.target.com/s?searchTerm=Peach&tref=typeahead%7Cterm%7CPeach%7C%7C%7Chistory"
driver.get(products_url)

time.sleep(3)

# Scroll and wait for content
scroll_and_wait(driver)

# Wait for images to load
wait_for_images(driver)

time.sleep(2)

soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()

In [33]:
from urllib.parse import urljoin
import json


def convert_relative_urls(soup, base_url):
 """
 Convert all relative URLs in the soup object to absolute URLs.
 Handles href, src, and data-src attributes.
 """
 # Convert href attributes (links)
 for tag in soup.find_all(href=True):
 tag['href'] = urljoin(base_url, tag['href'])

 # Convert src attributes (images, scripts, etc.)
 for tag in soup.find_all(src=True):
 tag['src'] = urljoin(base_url, tag['src'])

 # Convert data-src attributes (lazy loaded images)
 for tag in soup.find_all(attrs={'data-src': True}):
 tag['data-src'] = urljoin(base_url, tag['data-src'])

 return soup


def count_images_in_element(element):
 """
 Count all images within an element, including nested ones.
 """
 return len(element.find_all('img', recursive=True))


def get_element_identifier(element):
 """
 Create a unique identifier for an element including tag and classes.
 """
 identifier = element.name
 if element.get('class'):
 identifier += f" .{' .'.join(element['class'])}"
 if element.get('id'):
 identifier += f" #{element['id']}"
 return identifier


def has_child_with_same_count(element, image_count, all_elements_with_counts):
 """
 Check if the element has any child with the same image count.
 """
 for other_element, other_count in all_elements_with_counts:
 if other_count == image_count and other_element != element:
 if any(parent == element for parent in other_element.parents):
 return True
 return False


def print_results_with_content(element_list):
 """
 Print formatted results including the inner content of elements.
 """
 print("\nElements Containing Most Images (Lowest Level for Each Count):")
 print("=" * 100)

 for rank, (tag_info, count, element) in enumerate(element_list, 1):
 print(f"\nRank {rank}:")
 print("-" * 100)
 print(f"Element: {tag_info}")
 print(f"Image Count: {count}")
 print("\nContent Preview:")
 print("-" * 100)

 # Get all immediate img tags
 immediate_images = element.find_all('img', recursive=False)
 nested_images = element.find_all('img', recursive=True)

 print(f"Direct images: {len(immediate_images)}")
 print(f"Total images (including nested): {len(nested_images)}")
 print("\nImage sources:")

 # Print image sources and alt text
 for img in nested_images:
 src = img.get('src', 'No source')
 alt = img.get('alt', 'No alt text')
 print(f"- Source: {src}")
 print(f" Alt text: {alt}")

 print("\nFull HTML structure:")
 print("-" * 100)
 # Print formatted HTML structure
 html_content = element.prettify()
 print(html_content)
 print("=" * 100)


def find_top_image_parent(soup, base_url):
 """
 Find the element containing the most images at the lowest level and return its details as JSON.
 """
 # Collect all elements with their image counts
 soup = convert_relative_urls(soup, base_url)

 elements_with_counts = []
 for element in soup.find_all():
 if element.name != 'img': # Skip img tags themselves
 image_count = count_images_in_element(element)
 if image_count > 0:
 elements_with_counts.append((element, image_count))

 # Sort by image count in descending order
 elements_with_counts.sort(key=lambda x: x[1], reverse=True)

 if not elements_with_counts:
 return json.dumps({"error": "No elements with images found"}, indent=2)

 max_count = elements_with_counts[0][1]

 # Get all elements with max count
 top_elements = [(elem, count) for elem, count in elements_with_counts if count == max_count]
 print(len(elements_with_counts))
 # 
 # # Find the lowest-level element among those with max count
 # top_element = None
 # for element, count in top_elements:
 # if not has_child_with_same_count(element, count, elements_with_counts):
 # top_element = element
 # break
 # 
 # if not top_element:
 # return json.dumps({"error": "No suitable element found"}, indent=2)
 # 
 # # Collect all images within the element
 # images = []
 # for img in top_element.find_all('img', recursive=True):
 # image_data = {
 # "src": img.get('src', 'No source'),
 # "alt": img.get('alt', 'No alt text')
 # }
 # # Add any other attributes that exist
 # for attr in ['title', 'width', 'height', 'class']:
 # if img.get(attr):
 # image_data[attr] = img[attr]
 # images.append(image_data)
 # 
 # # Create result dictionary
 # result = {
 # "element": {
 # "tag": top_element.name,
 # "identifier": get_element_identifier(top_element),
 # "classes": top_element.get('class', []),
 # "id": top_element.get('id', None)
 # },
 # "image_count": max_count,
 # "images": images,
 # "html_content": str(top_element)
 # }
 # 
 # # Create styled HTML output
 # style_tag = f"""
 # 
 # """
 # html_output = style_tag + str(top_element)
 # 
 # return json.dumps(result, indent=2), html_output


In [34]:
base_url = products_url.rsplit('/', 1)[0]
find_top_image_parent(soup, base_url)
#
# with open("output.json", "w") as file:
# file.write(json_data)
# 
# with open("output.html", "w") as file:
# file.write(html_content)

411
