supunTE committed on
Commit
112a5ac
·
1 Parent(s): 6005655

include output

Browse files
Files changed (1) hide show
  1. scrape-content.ipynb +204 -109
scrape-content.ipynb CHANGED
@@ -174,8 +174,8 @@
174
  {
175
  "metadata": {
176
  "ExecuteTime": {
177
- "end_time": "2024-10-25T10:52:34.470225Z",
178
- "start_time": "2024-10-25T10:52:34.458243Z"
179
  }
180
  },
181
  "cell_type": "code",
@@ -234,51 +234,14 @@
234
  " return False\n",
235
  "\n",
236
  "\n",
237
- "def print_results_with_content(element_list):\n",
238
  " \"\"\"\n",
239
- " Print formatted results including the inner content of elements.\n",
240
  " \"\"\"\n",
241
- " print(\"\\nElements Containing Most Images (Lowest Level for Each Count):\")\n",
242
- " print(\"=\" * 100)\n",
243
- "\n",
244
- " for rank, (tag_info, count, element) in enumerate(element_list, 1):\n",
245
- " print(f\"\\nRank {rank}:\")\n",
246
- " print(\"-\" * 100)\n",
247
- " print(f\"Element: {tag_info}\")\n",
248
- " print(f\"Image Count: {count}\")\n",
249
- " print(\"\\nContent Preview:\")\n",
250
- " print(\"-\" * 100)\n",
251
- "\n",
252
- " # Get all immediate img tags\n",
253
- " immediate_images = element.find_all('img', recursive=False)\n",
254
- " nested_images = element.find_all('img', recursive=True)\n",
255
- "\n",
256
- " print(f\"Direct images: {len(immediate_images)}\")\n",
257
- " print(f\"Total images (including nested): {len(nested_images)}\")\n",
258
- " print(\"\\nImage sources:\")\n",
259
- "\n",
260
- " # Print image sources and alt text\n",
261
- " for img in nested_images:\n",
262
- " src = img.get('src', 'No source')\n",
263
- " alt = img.get('alt', 'No alt text')\n",
264
- " print(f\"- Source: {src}\")\n",
265
- " print(f\" Alt text: {alt}\")\n",
266
- "\n",
267
- " print(\"\\nFull HTML structure:\")\n",
268
- " print(\"-\" * 100)\n",
269
- " # Print formatted HTML structure\n",
270
- " html_content = element.prettify()\n",
271
- " print(html_content)\n",
272
- " print(\"=\" * 100)\n",
273
- "\n",
274
- "\n",
275
- "def find_top_image_parent(soup, base_url):\n",
276
- " \"\"\"\n",
277
- " Find the element containing the most images at the lowest level and return its details as JSON.\n",
278
- " \"\"\"\n",
279
- " # Collect all elements with their image counts\n",
280
  " soup = convert_relative_urls(soup, base_url)\n",
281
  "\n",
 
282
  " elements_with_counts = []\n",
283
  " for element in soup.find_all():\n",
284
  " if element.name != 'img': # Skip img tags themselves\n",
@@ -290,67 +253,95 @@
290
  " elements_with_counts.sort(key=lambda x: x[1], reverse=True)\n",
291
  "\n",
292
  " if not elements_with_counts:\n",
293
- " return json.dumps({\"error\": \"No elements with images found\"}, indent=2)\n",
294
  "\n",
 
 
295
  " max_count = elements_with_counts[0][1]\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  "\n",
297
- " # Get all elements with max count\n",
298
- " top_elements = [(elem, count) for elem, count in elements_with_counts if count == max_count]\n",
299
- " print(len(elements_with_counts))\n",
300
- " # \n",
301
- " # # Find the lowest-level element among those with max count\n",
302
- " # top_element = None\n",
303
- " # for element, count in top_elements:\n",
304
- " # if not has_child_with_same_count(element, count, elements_with_counts):\n",
305
- " # top_element = element\n",
306
- " # break\n",
307
- " # \n",
308
- " # if not top_element:\n",
309
- " # return json.dumps({\"error\": \"No suitable element found\"}, indent=2)\n",
310
- " # \n",
311
- " # # Collect all images within the element\n",
312
- " # images = []\n",
313
- " # for img in top_element.find_all('img', recursive=True):\n",
314
- " # image_data = {\n",
315
- " # \"src\": img.get('src', 'No source'),\n",
316
- " # \"alt\": img.get('alt', 'No alt text')\n",
317
- " # }\n",
318
- " # # Add any other attributes that exist\n",
319
- " # for attr in ['title', 'width', 'height', 'class']:\n",
320
- " # if img.get(attr):\n",
321
- " # image_data[attr] = img[attr]\n",
322
- " # images.append(image_data)\n",
323
- " # \n",
324
- " # # Create result dictionary\n",
325
- " # result = {\n",
326
- " # \"element\": {\n",
327
- " # \"tag\": top_element.name,\n",
328
- " # \"identifier\": get_element_identifier(top_element),\n",
329
- " # \"classes\": top_element.get('class', []),\n",
330
- " # \"id\": top_element.get('id', None)\n",
331
- " # },\n",
332
- " # \"image_count\": max_count,\n",
333
- " # \"images\": images,\n",
334
- " # \"html_content\": str(top_element)\n",
335
- " # }\n",
336
- " # \n",
337
- " # # Create styled HTML output\n",
338
- " # style_tag = f\"\"\"\n",
339
- " # <style>\n",
340
- " # img {{\n",
341
- " # width: 300px;\n",
342
- " # height: 300px;\n",
343
- " # object-fit: contain;\n",
344
- " # }}\n",
345
- " # </style>\n",
346
- " # \"\"\"\n",
347
- " # html_output = style_tag + str(top_element)\n",
348
- " # \n",
349
- " # return json.dumps(result, indent=2), html_output\n"
350
  ],
351
  "id": "3830f2e224e84798",
352
  "outputs": [],
353
- "execution_count": 33
354
  },
355
  {
356
  "metadata": {},
@@ -361,20 +352,23 @@
361
  {
362
  "metadata": {
363
  "ExecuteTime": {
364
- "end_time": "2024-10-25T10:52:36.684418Z",
365
- "start_time": "2024-10-25T10:52:36.614623Z"
366
  }
367
  },
368
  "cell_type": "code",
369
  "source": [
370
  "base_url = products_url.rsplit('/', 1)[0]\n",
371
- "find_top_image_parent(soup, base_url)\n",
372
- "#\n",
373
- "# with open(\"output.json\", \"w\") as file:\n",
374
- "# file.write(json_data)\n",
375
- "# \n",
376
- "# with open(\"output.html\", \"w\") as file:\n",
377
- "# file.write(html_content)"
 
 
 
378
  ],
379
  "id": "20b0b8cd238de02d",
380
  "outputs": [
@@ -382,11 +376,112 @@
382
  "name": "stdout",
383
  "output_type": "stream",
384
  "text": [
385
- "411\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  ]
387
  }
388
  ],
389
- "execution_count": 34
390
  }
391
  ],
392
  "metadata": {
 
174
  {
175
  "metadata": {
176
  "ExecuteTime": {
177
+ "end_time": "2024-10-25T12:57:47.980010Z",
178
+ "start_time": "2024-10-25T12:57:47.962944Z"
179
  }
180
  },
181
  "cell_type": "code",
 
234
  " return False\n",
235
  "\n",
236
  "\n",
237
+ "def find_image_rich_parents(soup, base_url):\n",
238
  " \"\"\"\n",
239
+ " Find elements containing images and return both sorted list and detailed top element info.\n",
240
  " \"\"\"\n",
241
+ " # Convert relative URLs to absolute\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  " soup = convert_relative_urls(soup, base_url)\n",
243
  "\n",
244
+ " # Collect all elements with their image counts\n",
245
  " elements_with_counts = []\n",
246
  " for element in soup.find_all():\n",
247
  " if element.name != 'img': # Skip img tags themselves\n",
 
253
  " elements_with_counts.sort(key=lambda x: x[1], reverse=True)\n",
254
  "\n",
255
  " if not elements_with_counts:\n",
256
+ " return [], {\"error\": \"No elements with images found\"}, \"\"\n",
257
  "\n",
258
+ " # Process elements for sorted list\n",
259
+ " sorted_elements = []\n",
260
  " max_count = elements_with_counts[0][1]\n",
261
+ " current_count = max_count\n",
262
+ "\n",
263
+ " while current_count > 0 and len(sorted_elements) < 100:\n",
264
+ " # Get all elements with current count\n",
265
+ " current_elements = [(elem, count) for elem, count in elements_with_counts if count == current_count]\n",
266
+ "\n",
267
+ " # Filter out elements that have children with the same count\n",
268
+ " for element, count in current_elements:\n",
269
+ " if not has_child_with_same_count(element, count, elements_with_counts):\n",
270
+ " sorted_elements.append((get_element_identifier(element), count))\n",
271
+ "\n",
272
+ " # Move to next highest count\n",
273
+ " remaining_counts = [count for _, count in elements_with_counts if count < current_count]\n",
274
+ " current_count = max(remaining_counts) if remaining_counts else 0\n",
275
+ "\n",
276
+ " # Get detailed info for top element\n",
277
+ " top_element = elements_with_counts[0][0]\n",
278
+ " max_count = elements_with_counts[0][1]\n",
279
+ "\n",
280
+ " # Find the lowest-level element among those with max count\n",
281
+ " for element, count in elements_with_counts:\n",
282
+ " if count == max_count and not has_child_with_same_count(element, count, elements_with_counts):\n",
283
+ " top_element = element\n",
284
+ " break\n",
285
+ "\n",
286
+ " # Collect all images within the top element\n",
287
+ " images = []\n",
288
+ " for img in top_element.find_all('img', recursive=True):\n",
289
+ " image_data = {\n",
290
+ " \"src\": img.get('src', 'No source'),\n",
291
+ " \"alt\": img.get('alt', 'No alt text')\n",
292
+ " }\n",
293
+ " for attr in ['title', 'width', 'height', 'class']:\n",
294
+ " if img.get(attr):\n",
295
+ " image_data[attr] = img[attr]\n",
296
+ " images.append(image_data)\n",
297
+ "\n",
298
+ " # Create result dictionary for top element\n",
299
+ " top_element_info = {\n",
300
+ " \"element\": {\n",
301
+ " \"tag\": top_element.name,\n",
302
+ " \"identifier\": get_element_identifier(top_element),\n",
303
+ " \"classes\": top_element.get('class', []),\n",
304
+ " \"id\": top_element.get('id', None)\n",
305
+ " },\n",
306
+ " \"image_count\": max_count,\n",
307
+ " \"images\": images,\n",
308
+ " \"html_content\": str(top_element)\n",
309
+ " }\n",
310
+ "\n",
311
+ " # Create styled HTML output\n",
312
+ " style_tag = \"\"\"\n",
313
+ " <style>\n",
314
+ " img {\n",
315
+ " width: 300px;\n",
316
+ " height: 300px;\n",
317
+ " object-fit: contain;\n",
318
+ " }\n",
319
+ " </style>\n",
320
+ " \"\"\"\n",
321
+ " html_output = style_tag + str(top_element)\n",
322
  "\n",
323
+ " return sorted_elements, top_element_info, html_output\n",
324
+ "\n",
325
+ "\n",
def print_results(element_list):
    """
    Print a ranked table of element identifiers and their image counts.

    Args:
        element_list: Iterable of ``(tag_info, count)`` pairs, where
            ``tag_info`` is the element identifier string and ``count`` is
            the number of images found in it. Rows are printed in the order
            given, ranked starting from 1.

    Returns:
        None. The table is written to standard output.
    """
    # The header and every row are built from the same column widths so the
    # column titles always line up with the data beneath them.
    rank_width = 5
    tag_width = 45
    rule = "-" * 70

    header = (
        f"{'Rank'.ljust(rank_width)} "
        f"{'Element Tag & Classes'.ljust(tag_width)} "
        "Image Count"
    )

    print("\nElements Containing Most Images (Lowest Level for Each Count):")
    print(rule)
    print(header)
    print(rule)

    for rank, (tag_info, count) in enumerate(element_list, 1):
        rank_str = f"{rank}.".ljust(rank_width)
        # ljust only pads: identifiers longer than the column are printed in
        # full rather than truncated, so no class names are lost.
        tag_info_padded = tag_info.ljust(tag_width)
        print(f"{rank_str} {tag_info_padded} {count}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  ],
342
  "id": "3830f2e224e84798",
343
  "outputs": [],
344
+ "execution_count": 46
345
  },
346
  {
347
  "metadata": {},
 
352
  {
353
  "metadata": {
354
  "ExecuteTime": {
355
+ "end_time": "2024-10-25T13:00:02.064761Z",
356
+ "start_time": "2024-10-25T13:00:01.277970Z"
357
  }
358
  },
359
  "cell_type": "code",
360
  "source": [
# Everything after the last '/' of the products URL is dropped to form the
# base that relative URLs are resolved against.
base_url = products_url.rsplit('/', 1)[0]
sorted_elements, top_element_info, html_output = find_image_rich_parents(soup, base_url)

# Print sorted list
print_results(sorted_elements)

# Write both files with an explicit UTF-8 encoding: the HTML is scraped page
# content and may contain non-ASCII characters, which would raise
# UnicodeEncodeError on platforms whose default text encoding is not UTF-8.
with open("output1.json", "w", encoding="utf-8") as file:
    file.write(json.dumps(top_element_info, indent=2))

with open("output1.html", "w", encoding="utf-8") as file:
    file.write(html_output)
372
  ],
373
  "id": "20b0b8cd238de02d",
374
  "outputs": [
 
376
  "name": "stdout",
377
  "output_type": "stream",
378
  "text": [
379
+ "\n",
380
+ "Elements Containing Most Images (Lowest Level for Each Count):\n",
381
+ "----------------------------------------------------------------------\n",
382
+ "Rank Element Tag & Classes Image Count\n",
383
+ "----------------------------------------------------------------------\n",
384
+ "1. div 63\n",
385
+ "2. div .sc-5da3fdcc-0 .cqdDWw 58\n",
386
+ "3. div 5\n",
387
+ "4. div .l-container-fixed .h-padding-h-x6 .h-padding-t-x6 .h-display-flex .h-flex-direction-row .h-flex-justify-space-between 4\n",
388
+ "5. div 3\n",
389
+ "6. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
390
+ "7. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
391
+ "8. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
392
+ "9. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
393
+ "10. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
394
+ "11. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
395
+ "12. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
396
+ "13. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
397
+ "14. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
398
+ "15. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
399
+ "16. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
400
+ "17. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
401
+ "18. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
402
+ "19. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
403
+ "20. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
404
+ "21. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
405
+ "22. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
406
+ "23. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
407
+ "24. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
408
+ "25. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
409
+ "26. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
410
+ "27. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
411
+ "28. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
412
+ "29. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
413
+ "30. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
414
+ "31. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
415
+ "32. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
416
+ "33. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
417
+ "34. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
418
+ "35. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
419
+ "36. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
420
+ "37. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
421
+ "38. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
422
+ "39. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
423
+ "40. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
424
+ "41. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
425
+ "42. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
426
+ "43. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
427
+ "44. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
428
+ "45. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
429
+ "46. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
430
+ "47. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
431
+ "48. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
432
+ "49. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
433
+ "50. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
434
+ "51. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
435
+ "52. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
436
+ "53. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
437
+ "54. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
438
+ "55. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
439
+ "56. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
440
+ "57. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
441
+ "58. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
442
+ "59. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
443
+ "60. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
444
+ "61. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
445
+ "62. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
446
+ "63. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
447
+ "64. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
448
+ "65. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
449
+ "66. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
450
+ "67. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
451
+ "68. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
452
+ "69. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
453
+ "70. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
454
+ "71. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
455
+ "72. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
456
+ "73. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
457
+ "74. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
458
+ "75. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
459
+ "76. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
460
+ "77. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
461
+ "78. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
462
+ "79. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
463
+ "80. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
464
+ "81. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
465
+ "82. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
466
+ "83. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
467
+ "84. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
468
+ "85. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
469
+ "86. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
470
+ "87. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
471
+ "88. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
472
+ "89. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
473
+ "90. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
474
+ "91. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
475
+ "92. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
476
+ "93. span 1\n",
477
+ "94. span 1\n",
478
+ "95. span 1\n",
479
+ "96. span 1\n",
480
+ "97. div .Illustration_IllustrationWrapper__xJP5g 1\n"
481
  ]
482
  }
483
  ],
484
+ "execution_count": 49
485
  }
486
  ],
487
  "metadata": {