# NOTE: the two helpers below are commented out (disabled).
# Re-enabling is_likely_product_card requires `import re` (used by the price patterns).
#
# def is_likely_product_card(element, min_text_length=10):
#     """
#     Determine if an element is likely to be a product card based on various heuristics.
#     """
#     # 1. Check for common product card class/id patterns
#     identifier = element.get('class', []) + [element.get('id', '')]
#     product_patterns = ['product', 'item', 'card', 'goods', 'listing']
#     if any(any(pattern in str(attr).lower() for pattern in product_patterns) for attr in identifier):
#         return True
#
#     # 2. Check for price patterns
#     text_content = element.get_text()
#     price_patterns = [
#         r'\$\d+\.?\d*',  # USD
#         r'£\d+\.?\d*',  # GBP
#         r'€\d+\.?\d*',  # EUR
#         r'\d+\.?\d*\s*USD',
#         r'\d+\.?\d*\s*EUR'
#     ]
#     if any(re.search(pattern, text_content) for pattern in price_patterns):
#         return True
#
#     # 3. Check for minimum text content (excluding whitespace)
#     clean_text = ' '.join(text_content.split())
#     if len(clean_text) < min_text_length:
#         return False
#
#     # 4. Check for typical product card elements
#     has_title = bool(element.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
#
#     return has_title
#
#
# def should_exclude_element(element):
#     # """
#     # Check if an element should be excluded from consideration.
#     # """
#     # # 1. Exclude common non-product sections
#     exclude_patterns = [
#         'filter', 'filters', 'sidebar', 'menu', 'nav', 'header', 'footer', 'cart',
#         'search', 'pagination', 'sort', 'banner', 'ad', 'slider'
#     ]
#
#     # Check class and id
#     element_classes = ' '.join(element.get('class', [])).replace("-", " ").replace("_", " ").lower().split()
#     element_id = str(element.get('id', '')).replace("-", " ").replace("_", " ").lower().split()
#     # print(element_classes)
#
#     for pattern in exclude_patterns:
#         if pattern in element_classes:
#             print(f"Excluded element due to class containing '{pattern}'")
#             return True
#         if pattern in element_id:
#             print(f"Excluded element due to id containing '{pattern}'")
#             return True
#
#     return False