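"""Streamlit app that analyzes uploaded HTML files and surfaces the most
image-rich container elements (e.g. product grids), with JSON and HTML
export of the results.

Run with: streamlit run <this_file>.py

Programmatic use of the core function (a sketch; "page.html" is a
hypothetical input file):

    with open("page.html", encoding="utf-8") as f:
        elements, info, markup = find_image_rich_parents(f.read(), min_children=4)
"""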
import base64
import copy
import json
from collections import Counter
from urllib.parse import urljoin

import streamlit as st
from bs4 import BeautifulSoup


def remove_svg_elements(element):
    """
    Remove all SVG elements from a BeautifulSoup element.
    Returns a copy of the element with SVGs removed.
    """
    # copy.copy() on a BeautifulSoup Tag yields a full, detached copy of the
    # subtree, so the caller's original tree is left untouched
    element_copy = copy.copy(element)

    # Find and remove all SVG elements
    if hasattr(element_copy, 'find_all'):
        svg_elements = element_copy.find_all('svg')
        for svg in svg_elements:
            svg.decompose()

    return element_copy

def get_element_signature(element):
    """
    Create a signature for an element based on its structure.
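    Elements that share a signature are treated as structurally similar
    when scoring candidate containers.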
    """
    signature = {
        'tag': element.name,
        'classes': tuple(sorted(element.get('class', []))),
        'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),
        'has_image': bool(element.find('img')),
        'has_price': any(c in element.get_text() for c in '$€£¥'),
        'has_link': bool(element.find('a')),
    }
    return str(signature)

def analyze_children_similarity(element):
    """
    Analyze how similar the direct children of an element are.
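
    Returns a (similarity_score, most_common_count) pair, where
    similarity_score is the fraction of direct children sharing the
    most common structural signature.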
    """
    if not element.contents:
        return 0, 0

    child_signatures = [
        get_element_signature(child)
        for child in element.find_all(recursive=False)
        if child.name
    ]

    if not child_signatures:
        return 0, 0

    signature_counts = Counter(child_signatures)
    _, most_common_count = signature_counts.most_common(1)[0]
    similarity_score = most_common_count / len(child_signatures)

    return similarity_score, most_common_count

def count_images_in_element(element):
    """
    Count all images within an element, including nested ones.
    """
    return len(element.find_all('img', recursive=True))

def get_element_identifier(element):
    """
    Create a unique identifier for an element including tag and classes.
    """
    identifier = element.name
    if element.get('class'):
        identifier += f" .{' .'.join(element['class'])}"
    if element.get('id'):
        identifier += f" #{element['id']}"
    return identifier

def convert_relative_urls(soup, base_url):
    """
    Convert all relative URLs in the soup object to absolute URLs.
    """
    for tag in soup.find_all(href=True):
        tag['href'] = urljoin(base_url, tag['href'])
    for tag in soup.find_all(src=True):
        tag['src'] = urljoin(base_url, tag['src'])
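    # Many pages lazy-load images via a data-src attribute, so convert it too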
    for tag in soup.find_all(attrs={'data-src': True}):
        tag['data-src'] = urljoin(base_url, tag['data-src'])
    return soup

def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
    """
    Find elements containing images; return a ranked list of candidates,
    detailed info for the top-scoring element, and its HTML.
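
    Returns a (sorted_elements, top_element_info, html_output) triple:
    sorted_elements is a list of (identifier, image_count, products_count)
    tuples ordered by a combined similarity/image score.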
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Convert relative URLs to absolute if base_url is provided
    if base_url:
        soup = convert_relative_urls(soup, base_url)

    # Collect potential container elements with their scores
    elements_with_scores = []
    for element in soup.find_all():
        if element.name in ['div', 'ul', 'section', 'main']:
            similarity_score, similar_children_count = analyze_children_similarity(element)
            image_count = count_images_in_element(element)

            if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:
                # Count products (direct children with images)
                products_count = len([child for child in element.find_all(recursive=False)
                                      if child.name and child.find('img', recursive=True)])

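                # Heuristic score: favors containers whose children are
                # numerous, structurally uniform, and image-rich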
                combined_score = (similarity_score * similar_children_count * image_count)
                elements_with_scores.append((element, image_count, combined_score, products_count))

    if not elements_with_scores:
        return [], {"error": "No elements with images found"}, ""

    # Sort by combined score
    elements_with_scores.sort(key=lambda x: x[2], reverse=True)

    # Process elements for sorted list output
    sorted_elements = []
    for element, image_count, _, products_count in elements_with_scores:
        sorted_elements.append((get_element_identifier(element), image_count, products_count))

    # Get top element (one with highest combined score)
    top_element = elements_with_scores[0][0]

    # Remove SVGs from the top element for HTML output
    top_element_no_svg = remove_svg_elements(top_element)

    # Separate child elements with images
    products = []
    for child in top_element_no_svg.find_all(recursive=False):
        if child.name:  # Skip text nodes
            # Remove SVGs from each product
            child_no_svg = remove_svg_elements(child)
            product_info = {
                "html_content": str(child_no_svg),
                "images": []
            }

            # Get all images within this product
            for img in child_no_svg.find_all('img', recursive=True):
                image_info = {
                    "src": img.get('src', 'No source'),
                    "alt": img.get('alt', 'No alt text')
                }
                product_info["images"].append(image_info)

            products.append(product_info)

    # Create result dictionary for top element
    top_element_info = {
        "parent": {
            "tag": top_element_no_svg.name,
            "identifier": get_element_identifier(top_element_no_svg),
            "classes": top_element_no_svg.get('class', []),
            "id": top_element_no_svg.get('id', None)
        },
        "products_count": len(products),
        "products": products
    }

    html_output = str(top_element_no_svg)

    return sorted_elements, top_element_info, html_output

def get_download_link(content, filename, content_type="file/json"):
    """Generate a download link for the given content"""
    b64 = base64.b64encode(content.encode()).decode()
    return f'<a href="data:{content_type};base64,{b64}" download="{filename}">Download {filename}</a>'

def main():
    st.title("HTML File Analyzer")
    st.write("Upload HTML files to analyze their structure and find image-rich elements")

    # File uploader allows multiple files
    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])

    if uploaded_files:
        # Analysis parameters
        col1, col2 = st.columns(2)
        with col1:
            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
        with col2:
            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)

        # Generate button
        if st.button("Generate Analysis"):
            # Show processing message
            with st.spinner('Processing files...'):
                all_results = {}
                all_html_outputs = {}

                # Process each file
                for uploaded_file in uploaded_files:
                    st.subheader(f"Analysis for {uploaded_file.name}")

                    try:
                        # Read and process the file
                        html_content = uploaded_file.read().decode('utf-8')
                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
                            html_content,
                            min_children=min_children,
                            min_similarity=min_similarity
                        )

                        # Display results
                        st.write("Elements containing images:")
                        for element, img_count, prod_count in sorted_elements:
                            st.write(f"- {element}: {img_count} images, {prod_count} products")

                        # Store results
                        all_results[uploaded_file.name] = top_element_info
                        all_html_outputs[uploaded_file.name] = html_output

                    except Exception as e:
                        st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                        continue

                # Create download buttons if we have results
                if all_results:
                    st.subheader("Download Results")
                    col1, col2 = st.columns(2)

                    # JSON download
                    with col1:
                        json_str = json.dumps(all_results, indent=2)
                        st.markdown(get_download_link(json_str, 'analysis_results.json'),
                                    unsafe_allow_html=True)

                    # HTML download
                    with col2:
                        # Combine all HTML outputs with file names as headers
                        combined_html = """
                            <!DOCTYPE html>
                            <html>
                            <head>
                                <meta charset='UTF-8'>
                                <style>
                                    div {
                                        width: auto !important;
                                        height: auto !important;
                                        padding: 0 !important;
                                        margin: 0 !important;
                                    }
                                    img {
                                        width: 300px;
                                        height: 300px;
                                        object-fit: contain;
                                    }
                                    body { font-family: Arial, sans-serif; }
                                    .file-section { margin: 20px 0; }
                                    .file-header { 
                                        background: #f0f0f0; 
                                        padding: 10px; 
                                        margin: 20px 0;
                                    }       
                                </style>
                            </head>
                            <body>
                            """
                        for filename, html in all_html_outputs.items():
                            combined_html += f"""
                                <div class="file-section">
                                    <h2 class="file-header">{filename}</h2>
                                    {html}
                                </div>
                                """
                        combined_html += "</body></html>"

                        st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
                                    unsafe_allow_html=True)

                    # Success message
                    st.success("Analysis completed successfully!")


if __name__ == "__main__":
    main()