{ "cells": [ { "metadata": { "ExecuteTime": { "end_time": "2024-10-25T10:32:47.963356Z", "start_time": "2024-10-25T10:32:47.950533Z" } }, "cell_type": "code", "source": [ "from fake_headers import Headers\n", "\n", "headers = Headers(headers=True).generate()\n", "headers" ], "id": "c60b4d771c2e0a21", "outputs": [ { "data": { "text/plain": [ "{'Accept': '*/*',\n", " 'Connection': 'keep-alive',\n", " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6; rv:61.0) Gecko/20100101 Firefox/61.0',\n", " 'Cache-Control': 'max-age=0',\n", " 'Upgrade-Insecure-Requests': '1',\n", " 'Referer': 'https://google.com'}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 9 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-25T10:32:49.821005Z", "start_time": "2024-10-25T10:32:49.798988Z" } }, "cell_type": "code", "source": [ "from selenium.webdriver.chrome.options import Options\n", "from selenium import webdriver\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from selenium.webdriver.common.by import By\n", "from bs4 import BeautifulSoup\n", "import time\n", "\n", "\n", "def scroll_and_wait(driver, scroll_pause_time=2):\n", " \"\"\"\n", " Scroll the page gradually and wait for images to load\n", " \"\"\"\n", " # Get scroll height\n", " last_height = driver.execute_script(\"return document.body.scrollHeight\")\n", "\n", " while True:\n", " # Scroll down gradually\n", " for i in range(10):\n", " driver.execute_script(f\"window.scrollTo(0, {(i + 1) * (last_height / 10)});\")\n", " time.sleep(0.5) # Short pause between each scroll step\n", "\n", " # Wait for new images to load\n", " time.sleep(scroll_pause_time)\n", "\n", " # Calculate new scroll height and compare with last scroll height\n", " new_height = driver.execute_script(\"return document.body.scrollHeight\")\n", " if new_height == last_height:\n", " break\n", " last_height = new_height\n", "\n", "\n", "def wait_for_images(driver, timeout=10):\n", " \"\"\"\n", " Wait for images to load and become visible\n", " \"\"\"\n", " try:\n", " # Wait for all image elements to be present\n", " WebDriverWait(driver, timeout).until(\n", " EC.presence_of_all_elements_located((By.TAG_NAME, \"img\"))\n", " )\n", "\n", " # Get all image elements\n", " images = driver.find_elements(By.TAG_NAME, \"img\")\n", "\n", " # Wait for images to load\n", " for img in images:\n", " try:\n", " WebDriverWait(driver, 2).until(\n", " lambda d: img.get_attribute('complete') == 'true' and\n", " img.get_attribute('naturalHeight') != '0'\n", " )\n", " except:\n", " continue # Skip images that don't load within timeout\n", "\n", " except Exception as e:\n", " print(f\"Warning: Not all images could be loaded: {e}\")" ], "id": "11933d956e20b6b8", "outputs": [], "execution_count": 10 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-25T10:33:23.469518Z", "start_time": "2024-10-25T10:32:53.382666Z" } }, "cell_type": "code", "source": [ "chrome_options = Options()\n", "chrome_options.add_argument(\"--headless\")\n", "chrome_options.add_argument(\"--disable-gpu\")\n", "chrome_options.add_argument(\"--no-sandbox\")\n", "chrome_options.add_argument(\"--disable-dev-shm-usage\")\n", "\n", "# Add fake headers\n", "for key, value in headers.items():\n", " chrome_options.add_argument(f'--{key.lower()}={value}')\n", "\n", "# Additional configurations to appear more human-like\n", 
"chrome_options.add_argument(\"--disable-blink-features=AutomationControlled\")\n", "chrome_options.add_argument(\"--window-size=1920,1080\")\n", "\n", "# Enable images in headless mode\n", "chrome_options.add_argument(\"--force-device-scale-factor=1\")\n", "chrome_options.add_argument(\"--high-dpi-support=1\")\n", "\n", "# Privacy and fingerprinting prevention\n", "chrome_options.add_argument(\"--disable-blink-features\")\n", "chrome_options.add_argument(\"--disable-infobars\")\n", "chrome_options.add_experimental_option(\"excludeSwitches\", [\"enable-automation\"])\n", "chrome_options.add_experimental_option(\"useAutomationExtension\", False)\n", "\n", "# Enable JavaScript\n", "chrome_options.add_argument(\"--enable-javascript\")\n", "\n", "driver = webdriver.Chrome(options=chrome_options)\n", "\n", "driver.execute_cdp_cmd(\"Page.addScriptToEvaluateOnNewDocument\", {\n", " \"source\": \"\"\"\n", " Object.defineProperty(navigator, 'webdriver', {\n", " get: () => undefined\n", " })\n", " \"\"\"\n", "})\n", "\n", "products_url = \"https://www.target.com/s?searchTerm=Peach&tref=typeahead%7Cterm%7CPeach%7C%7C%7Chistory\"\n", "driver.get(products_url)\n", "\n", "time.sleep(3)\n", "\n", "# Scroll and wait for content\n", "scroll_and_wait(driver)\n", "\n", "# Wait for images to load\n", "wait_for_images(driver)\n", "\n", "time.sleep(2)\n", "\n", "soup = BeautifulSoup(driver.page_source, \"html.parser\")\n", "driver.quit()" ], "id": "ac14cff825f0887f", "outputs": [], "execution_count": 11 }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-25T10:52:34.470225Z", "start_time": "2024-10-25T10:52:34.458243Z" } }, "cell_type": "code", "source": [ "from urllib.parse import urljoin\n", "import json\n", "\n", "\n", "def convert_relative_urls(soup, base_url):\n", " \"\"\"\n", " Convert all relative URLs in the soup object to absolute URLs.\n", " Handles href, src, and data-src attributes.\n", " \"\"\"\n", " # Convert href attributes (links)\n", " for tag in soup.find_all(href=True):\n", " tag['href'] = urljoin(base_url, tag['href'])\n", "\n", " # Convert src attributes (images, scripts, etc.)\n", " for tag in soup.find_all(src=True):\n", " tag['src'] = urljoin(base_url, tag['src'])\n", "\n", " # Convert data-src attributes (lazy loaded images)\n", " for tag in soup.find_all(attrs={'data-src': True}):\n", " tag['data-src'] = urljoin(base_url, tag['data-src'])\n", "\n", " return soup\n", "\n", "\n", "def count_images_in_element(element):\n", " \"\"\"\n", " Count all images within an element, including nested ones.\n", " \"\"\"\n", " return len(element.find_all('img', recursive=True))\n", "\n", "\n", "def get_element_identifier(element):\n", " \"\"\"\n", " Create a unique identifier for an element including tag and classes.\n", " \"\"\"\n", " identifier = element.name\n", " if element.get('class'):\n", " identifier += f\" .{' .'.join(element['class'])}\"\n", " if element.get('id'):\n", " identifier += f\" #{element['id']}\"\n", " return identifier\n", "\n", "\n", "def has_child_with_same_count(element, image_count, all_elements_with_counts):\n", " \"\"\"\n", " Check if the element has any child with the same image count.\n", " \"\"\"\n", " for other_element, other_count in all_elements_with_counts:\n", " if other_count == image_count and other_element != element:\n", " if any(parent == element for parent in other_element.parents):\n", " return True\n", " return False\n", "\n", "\n", "def print_results_with_content(element_list):\n", " \"\"\"\n", " Print formatted results including the inner content of 
elements.\n", " \"\"\"\n", " print(\"\\nElements Containing Most Images (Lowest Level for Each Count):\")\n", " print(\"=\" * 100)\n", "\n", " for rank, (tag_info, count, element) in enumerate(element_list, 1):\n", " print(f\"\\nRank {rank}:\")\n", " print(\"-\" * 100)\n", " print(f\"Element: {tag_info}\")\n", " print(f\"Image Count: {count}\")\n", " print(\"\\nContent Preview:\")\n", " print(\"-\" * 100)\n", "\n", " # Get all immediate img tags\n", " immediate_images = element.find_all('img', recursive=False)\n", " nested_images = element.find_all('img', recursive=True)\n", "\n", " print(f\"Direct images: {len(immediate_images)}\")\n", " print(f\"Total images (including nested): {len(nested_images)}\")\n", " print(\"\\nImage sources:\")\n", "\n", " # Print image sources and alt text\n", " for img in nested_images:\n", " src = img.get('src', 'No source')\n", " alt = img.get('alt', 'No alt text')\n", " print(f\"- Source: {src}\")\n", " print(f\" Alt text: {alt}\")\n", "\n", " print(\"\\nFull HTML structure:\")\n", " print(\"-\" * 100)\n", " # Print formatted HTML structure\n", " html_content = element.prettify()\n", " print(html_content)\n", " print(\"=\" * 100)\n", "\n", "\n", "def find_top_image_parent(soup, base_url):\n", " \"\"\"\n", " Find the element containing the most images at the lowest level and return its details as JSON.\n", " \"\"\"\n", " # Collect all elements with their image counts\n", " soup = convert_relative_urls(soup, base_url)\n", "\n", " elements_with_counts = []\n", " for element in soup.find_all():\n", " if element.name != 'img': # Skip img tags themselves\n", " image_count = count_images_in_element(element)\n", " if image_count > 0:\n", " elements_with_counts.append((element, image_count))\n", "\n", " # Sort by image count in descending order\n", " elements_with_counts.sort(key=lambda x: x[1], reverse=True)\n", "\n", " if not elements_with_counts:\n", " return json.dumps({\"error\": \"No elements with images found\"}, indent=2)\n", "\n", " max_count = elements_with_counts[0][1]\n", "\n", " # Get all elements with max count\n", " top_elements = [(elem, count) for elem, count in elements_with_counts if count == max_count]\n", " print(len(elements_with_counts))\n", " # \n", " # # Find the lowest-level element among those with max count\n", " # top_element = None\n", " # for element, count in top_elements:\n", " # if not has_child_with_same_count(element, count, elements_with_counts):\n", " # top_element = element\n", " # break\n", " # \n", " # if not top_element:\n", " # return json.dumps({\"error\": \"No suitable element found\"}, indent=2)\n", " # \n", " # # Collect all images within the element\n", " # images = []\n", " # for img in top_element.find_all('img', recursive=True):\n", " # image_data = {\n", " # \"src\": img.get('src', 'No source'),\n", " # \"alt\": img.get('alt', 'No alt text')\n", " # }\n", " # # Add any other attributes that exist\n", " # for attr in ['title', 'width', 'height', 'class']:\n", " # if img.get(attr):\n", " # image_data[attr] = img[attr]\n", " # images.append(image_data)\n", " # \n", " # # Create result dictionary\n", " # result = {\n", " # \"element\": {\n", " # \"tag\": top_element.name,\n", " # \"identifier\": get_element_identifier(top_element),\n", " # \"classes\": top_element.get('class', []),\n", " # \"id\": top_element.get('id', None)\n", " # },\n", " # \"image_count\": max_count,\n", " # \"images\": images,\n", " # \"html_content\": str(top_element)\n", " # }\n", " # \n", " # # Create styled HTML output\n", " # style_tag = 
f\"\"\"\n", " # \n", " # \"\"\"\n", " # html_output = style_tag + str(top_element)\n", " # \n", " # return json.dumps(result, indent=2), html_output\n" ], "id": "3830f2e224e84798", "outputs": [], "execution_count": 33 }, { "metadata": {}, "cell_type": "markdown", "source": "", "id": "80fa7f140d4da0a2" }, { "metadata": { "ExecuteTime": { "end_time": "2024-10-25T10:52:36.684418Z", "start_time": "2024-10-25T10:52:36.614623Z" } }, "cell_type": "code", "source": [ "base_url = products_url.rsplit('/', 1)[0]\n", "find_top_image_parent(soup, base_url)\n", "#\n", "# with open(\"output.json\", \"w\") as file:\n", "# file.write(json_data)\n", "# \n", "# with open(\"output.html\", \"w\") as file:\n", "# file.write(html_content)" ], "id": "20b0b8cd238de02d", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "411\n" ] } ], "execution_count": 34 } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }