File size: 2,189 Bytes
f2fe0c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#
#
# def is_likely_product_card(element, min_text_length=10):
#     """
#     Determine if an element is likely to be a product card based on various heuristics.
#     """
#     # 1. Check for common product card class/id patterns
#     identifier = element.get('class', []) + [element.get('id', '')]
#     product_patterns = ['product', 'item', 'card', 'goods', 'listing']
#     if any(any(pattern in str(attr).lower() for pattern in product_patterns) for attr in identifier):
#         return True
#
#     # 2. Check for price patterns
#     text_content = element.get_text()
#     price_patterns = [
#         r'\$\d+\.?\d*',  # USD
#         r'£\d+\.?\d*',  # GBP
#         r'€\d+\.?\d*',  # EUR
#         r'\d+\.?\d*\s*USD',
#         r'\d+\.?\d*\s*EUR'
#     ]
#     if any(re.search(pattern, text_content) for pattern in price_patterns):
#         return True
#
#     # 3. Require a minimum amount of text (measured after collapsing runs of whitespace)
#     clean_text = ' '.join(text_content.split())
#     if len(clean_text) < min_text_length:
#         return False
#
#     # 4. Check for typical product card elements (currently only a heading tag, h1-h6)
#     has_title = bool(element.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
#
#     return has_title
#
#
# def should_exclude_element(element):
#     """
#     Check if an element should be excluded from consideration.
#     """
#
#     # 1. Exclude common non-product sections
#     exclude_patterns = [
#         'filter', 'filters', 'sidebar', 'menu', 'nav', 'header', 'footer', 'cart',
#         'search', 'pagination', 'sort', 'banner', 'ad', 'slider'
#     ]
#
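#     # Check class and id: split both into lowercase tokens, treating "-" and "_"
#     # as separators, so each pattern must match a whole token (not a substring)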
#     element_classes = ' '.join(element.get('class', [])).replace("-", " ").replace("_", " ").lower().split()
#     element_id = str(element.get('id', '')).replace("-", " ").replace("_", " ").lower().split()
#
#     print(element_classes)
#
#     for pattern in exclude_patterns:
#         if pattern in element_classes:
#             print(f"Excluded element due to class containing '{pattern}'")
#             return True
#         if pattern in element_id:
#             print(f"Excluded element due to id containing '{pattern}'")
#             return True
#
#     return False