# HTML File Analyzer — Streamlit app that scans uploaded HTML files for
# image-rich container elements (e.g. product grids) and offers the
# analysis results as JSON/HTML downloads.
import base64
import copy
import json
from collections import Counter
from urllib.parse import urljoin
import streamlit as st
from bs4 import BeautifulSoup
def remove_svg_elements(element):
    """
    Return a copy of *element* with every <svg> tag removed.

    The input is not modified: work happens on ``copy.copy(element)``.
    Objects without a ``find_all`` attribute are returned as a plain copy.
    """
    cleaned = copy.copy(element)
    if not hasattr(cleaned, 'find_all'):
        return cleaned
    # Decompose inside the copy only, leaving the original tree intact.
    for svg in cleaned.find_all('svg'):
        svg.decompose()
    return cleaned
def get_element_signature(element):
    """
    Build a structural fingerprint string for *element*.

    The fingerprint covers the tag name, its sorted CSS classes, the tag
    names of its direct children, and three content flags (image, currency
    symbol, link) — structurally similar siblings share a signature.
    """
    direct_child_tags = sorted(
        child.name
        for child in element.find_all(recursive=False)
        if child.name
    )
    text = element.get_text()
    fingerprint = {
        'tag': element.name,
        'classes': tuple(sorted(element.get('class', []))),
        'child_tags': tuple(direct_child_tags),
        'has_image': bool(element.find('img')),
        'has_price': any(symbol in text for symbol in '$€£¥'),
        'has_link': bool(element.find('a')),
    }
    return str(fingerprint)
def analyze_children_similarity(element):
    """
    Measure how structurally uniform the direct children of *element* are.

    Returns ``(similarity_score, count)``: *count* is how many children
    share the most common signature, and *similarity_score* is that count
    divided by the number of named direct children. ``(0, 0)`` when there
    is nothing to compare.
    """
    if not element.contents:
        return 0, 0
    signatures = [
        get_element_signature(child)
        for child in element.find_all(recursive=False)
        if child.name
    ]
    if not signatures:
        return 0, 0
    _, top_count = Counter(signatures).most_common(1)[0]
    return top_count / len(signatures), top_count
def count_images_in_element(element):
    """Return the total number of <img> tags anywhere under *element*."""
    images = element.find_all('img', recursive=True)
    return len(images)
def get_element_identifier(element):
    """
    Return a CSS-selector-like label for *element*: the tag name, then a
    ``.class`` token per class, then ``#id`` when present —
    e.g. ``div .card .item #main``.
    """
    parts = [element.name]
    classes = element.get('class')
    if classes:
        parts.append('.' + ' .'.join(classes))
    if element.get('id'):
        parts.append(f"#{element['id']}")
    return ' '.join(parts)
def convert_relative_urls(soup, base_url):
    """
    Rewrite every ``href``, ``src``, and ``data-src`` attribute in *soup*
    to an absolute URL resolved against *base_url*.

    Mutates *soup* in place and returns it for convenient chaining.
    """
    for attr in ('href', 'src'):
        for tag in soup.find_all(**{attr: True}):
            tag[attr] = urljoin(base_url, tag[attr])
    # 'data-src' is not a valid keyword argument name, so query via attrs.
    for tag in soup.find_all(attrs={'data-src': True}):
        tag['data-src'] = urljoin(base_url, tag['data-src'])
    return soup
def _collect_scored_containers(soup, min_children, min_similarity):
    """
    Scan *soup* for candidate container elements and score them.

    Only div/ul/section/main elements qualify, and only when they have at
    least *min_children* structurally similar children (similarity >=
    *min_similarity*) and contain at least one image. Returns a list of
    ``(element, image_count, combined_score, products_count)`` tuples,
    where products_count is the number of direct children holding an image.
    """
    scored = []
    for element in soup.find_all():
        if element.name not in ('div', 'ul', 'section', 'main'):
            continue
        similarity_score, similar_children_count = analyze_children_similarity(element)
        image_count = count_images_in_element(element)
        if (similar_children_count >= min_children
                and similarity_score >= min_similarity
                and image_count > 0):
            products_count = len([
                child for child in element.find_all(recursive=False)
                if child.name and child.find('img', recursive=True)
            ])
            combined_score = similarity_score * similar_children_count * image_count
            scored.append((element, image_count, combined_score, products_count))
    return scored


def _extract_products(container):
    """
    Build one product dict per named direct child of *container*.

    Each dict holds the child's SVG-free HTML and a list of its images'
    src/alt attributes (with placeholder strings when missing).
    """
    products = []
    for child in container.find_all(recursive=False):
        if not child.name:  # skip text nodes
            continue
        child_no_svg = remove_svg_elements(child)
        products.append({
            "html_content": str(child_no_svg),
            "images": [
                {
                    "src": img.get('src', 'No source'),
                    "alt": img.get('alt', 'No alt text'),
                }
                for img in child_no_svg.find_all('img', recursive=True)
            ],
        })
    return products


def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
    """
    Find elements containing images and return both sorted list and detailed
    top element info.

    Parameters:
        html_content: raw HTML string to analyze.
        base_url: when non-empty, relative URLs are resolved against it.
        min_children: minimum number of similar direct children required.
        min_similarity: minimum fraction of children sharing the most
            common signature.

    Returns:
        ``(sorted_elements, top_element_info, html_output)`` where
        sorted_elements is ``[(identifier, image_count, products_count)]``
        ordered by score, top_element_info describes the best container and
        its products, and html_output is that container's SVG-free HTML.
        On no match: ``([], {"error": ...}, "")``.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    if base_url:
        soup = convert_relative_urls(soup, base_url)

    elements_with_scores = _collect_scored_containers(soup, min_children, min_similarity)
    if not elements_with_scores:
        return [], {"error": "No elements with images found"}, ""

    # Best candidate first: highest combined score wins.
    elements_with_scores.sort(key=lambda item: item[2], reverse=True)

    sorted_elements = [
        (get_element_identifier(element), image_count, products_count)
        for element, image_count, _, products_count in elements_with_scores
    ]

    top_element = elements_with_scores[0][0]
    # Strip SVGs once for the container; children are stripped again
    # individually when extracted as products.
    top_element_no_svg = remove_svg_elements(top_element)
    products = _extract_products(top_element_no_svg)

    top_element_info = {
        "parent": {
            "tag": top_element_no_svg.name,
            "identifier": get_element_identifier(top_element_no_svg),
            "classes": top_element_no_svg.get('class', []),
            "id": top_element_no_svg.get('id', None),
        },
        "products_count": len(products),
        "products": products,
    }
    return sorted_elements, top_element_info, str(top_element_no_svg)
def get_download_link(content, filename, content_type="application/json"):
    """
    Return an HTML anchor that downloads *content* as *filename* via a
    base64 data URI.

    Bug fixed: the anchor previously hard-coded the literal text
    "(unknown)" instead of interpolating *filename*, so the download name
    and link label were wrong for every file. The default MIME type
    "file/json" was also not a registered type; "application/json" is.
    """
    b64 = base64.b64encode(content.encode()).decode()
    return (
        f'<a href="data:{content_type};base64,{b64}" '
        f'download="{filename}">Download {filename}</a>'
    )
def main():
    """
    Streamlit entry point.

    Lets the user upload HTML files and tune the analysis thresholds, runs
    find_image_rich_parents on each file, shows the per-file results, and
    offers combined JSON and HTML downloads.

    Bug fixed: the per-file section header in the combined HTML download
    previously emitted the literal text "(unknown)" instead of the file
    name; it now interpolates *filename*.
    """
    st.title("HTML File Analyzer")
    st.write("Upload HTML files to analyze their structure and find image-rich elements")
    # File uploader allows multiple files
    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])
    if uploaded_files:
        all_results = {}
        all_html_outputs = {}
        # Analysis parameters
        col1, col2 = st.columns(2)
        with col1:
            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
        with col2:
            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)
        # Generate button
        if st.button("Generate Analysis"):
            # Show processing message
            with st.spinner('Processing files...'):
                all_results = {}
                all_html_outputs = {}
                # Process each file independently; one bad file must not
                # abort the rest of the batch.
                for uploaded_file in uploaded_files:
                    st.subheader(f"Analysis for {uploaded_file.name}")
                    try:
                        # Read and process the file
                        html_content = uploaded_file.read().decode('utf-8')
                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
                            html_content,
                            min_children=min_children,
                            min_similarity=min_similarity
                        )
                        # Display results
                        st.write("Elements containing images:")
                        for element, img_count, prod_count in sorted_elements:
                            st.write(f"- {element}: {img_count} images, {prod_count} products")
                        # Store results keyed by file name for the downloads
                        all_results[uploaded_file.name] = top_element_info
                        all_html_outputs[uploaded_file.name] = html_output
                    except Exception as e:
                        st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                        continue
                # Create download buttons if we have results
                if all_results:
                    st.subheader("Download Results")
                    col1, col2 = st.columns(2)
                    # JSON download
                    with col1:
                        json_str = json.dumps(all_results, indent=2)
                        st.markdown(get_download_link(json_str, 'analysis_results.json'),
                                    unsafe_allow_html=True)
                    # HTML download
                    with col2:
                        # Combine all HTML outputs with file names as headers
                        combined_html = """
                        <!DOCTYPE html>
                        <html>
                        <head>
                        <meta charset='UTF-8'>
                        <style>
                        div {
                            width: auto !important;
                            height: auto !important;
                            padding: 0 !important;
                            margin: 0 !important;
                        }
                        img {
                            width: 300px;
                            height: 300px;
                            object-fit: contain;
                        }
                        body { font-family: Arial, sans-serif; }
                        .file-section { margin: 20px 0; }
                        .file-header {
                            background: #f0f0f0;
                            padding: 10px;
                            margin: 20px 0;
                        }
                        </style>
                        </head>
                        <body>
                        """
                        for filename, html in all_html_outputs.items():
                            combined_html += f"""
                            <div class="file-section">
                                <h2 class="file-header">{filename}</h2>
                                {html}
                            </div>
                            """
                        combined_html += "</body></html>"
                        st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
                                    unsafe_allow_html=True)
                    # Success message
                    st.success("Analysis completed successfully!")
# Run the Streamlit app when executed directly. (Removed a stray trailing
# " |" after the call — a copy/paste artifact that was a syntax error.)
if __name__ == "__main__":
    main()