# HTML File Analyzer — Streamlit app that scans uploaded HTML files for
# image-rich container elements (e.g. product grids) and offers the
# analysis results as JSON/HTML downloads.
import base64
import copy
import json
from collections import Counter
from urllib.parse import urljoin
import streamlit as st
from bs4 import BeautifulSoup
def remove_svg_elements(element):
    """
    Return a copy of *element* with every <svg> tag removed.

    The input is not modified: work happens on ``copy.copy(element)``.
    Objects without a ``find_all`` attribute are returned as a plain copy.
    """
    cleaned = copy.copy(element)
    if not hasattr(cleaned, 'find_all'):
        return cleaned
    # Decompose inside the copy only, leaving the original tree intact.
    for svg in cleaned.find_all('svg'):
        svg.decompose()
    return cleaned
def get_element_signature(element):
    """
    Build a structural fingerprint string for *element*.

    The fingerprint covers the tag name, its sorted CSS classes, the tag
    names of its direct children, and three content flags (image, currency
    symbol, link) — structurally similar siblings share a signature.
    """
    direct_child_tags = sorted(
        child.name
        for child in element.find_all(recursive=False)
        if child.name
    )
    text = element.get_text()
    fingerprint = {
        'tag': element.name,
        'classes': tuple(sorted(element.get('class', []))),
        'child_tags': tuple(direct_child_tags),
        'has_image': bool(element.find('img')),
        'has_price': any(symbol in text for symbol in '$€£¥'),
        'has_link': bool(element.find('a')),
    }
    return str(fingerprint)
def analyze_children_similarity(element):
    """
    Measure how structurally uniform the direct children of *element* are.

    Returns ``(similarity_score, count)``: *count* is how many children
    share the most common signature, and *similarity_score* is that count
    divided by the number of named direct children. ``(0, 0)`` when there
    is nothing to compare.
    """
    if not element.contents:
        return 0, 0
    signatures = [
        get_element_signature(child)
        for child in element.find_all(recursive=False)
        if child.name
    ]
    if not signatures:
        return 0, 0
    _, top_count = Counter(signatures).most_common(1)[0]
    return top_count / len(signatures), top_count
def count_images_in_element(element):
    """Return the total number of <img> tags anywhere under *element*."""
    images = element.find_all('img', recursive=True)
    return len(images)
def get_element_identifier(element):
    """
    Return a CSS-selector-like label for *element*: the tag name, then a
    ``.class`` token per class, then ``#id`` when present —
    e.g. ``div .card .item #main``.
    """
    parts = [element.name]
    classes = element.get('class')
    if classes:
        parts.append('.' + ' .'.join(classes))
    if element.get('id'):
        parts.append(f"#{element['id']}")
    return ' '.join(parts)
def convert_relative_urls(soup, base_url):
    """
    Rewrite every ``href``, ``src``, and ``data-src`` attribute in *soup*
    to an absolute URL resolved against *base_url*.

    Mutates *soup* in place and returns it for convenient chaining.
    """
    for attr in ('href', 'src'):
        for tag in soup.find_all(**{attr: True}):
            tag[attr] = urljoin(base_url, tag[attr])
    # 'data-src' is not a valid keyword argument name, so query via attrs.
    for tag in soup.find_all(attrs={'data-src': True}):
        tag['data-src'] = urljoin(base_url, tag['data-src'])
    return soup
def _collect_scored_containers(soup, min_children, min_similarity):
    """
    Scan *soup* for candidate container elements and score them.

    Only div/ul/section/main elements qualify, and only when they have at
    least *min_children* structurally similar children (similarity >=
    *min_similarity*) and contain at least one image. Returns a list of
    ``(element, image_count, combined_score, products_count)`` tuples,
    where products_count is the number of direct children holding an image.
    """
    scored = []
    for element in soup.find_all():
        if element.name not in ('div', 'ul', 'section', 'main'):
            continue
        similarity_score, similar_children_count = analyze_children_similarity(element)
        image_count = count_images_in_element(element)
        if (similar_children_count >= min_children
                and similarity_score >= min_similarity
                and image_count > 0):
            products_count = len([
                child for child in element.find_all(recursive=False)
                if child.name and child.find('img', recursive=True)
            ])
            combined_score = similarity_score * similar_children_count * image_count
            scored.append((element, image_count, combined_score, products_count))
    return scored


def _extract_products(container):
    """
    Build one product dict per named direct child of *container*.

    Each dict holds the child's SVG-free HTML and a list of its images'
    src/alt attributes (with placeholder strings when missing).
    """
    products = []
    for child in container.find_all(recursive=False):
        if not child.name:  # skip text nodes
            continue
        child_no_svg = remove_svg_elements(child)
        products.append({
            "html_content": str(child_no_svg),
            "images": [
                {
                    "src": img.get('src', 'No source'),
                    "alt": img.get('alt', 'No alt text'),
                }
                for img in child_no_svg.find_all('img', recursive=True)
            ],
        })
    return products


def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
    """
    Find elements containing images and return both sorted list and detailed
    top element info.

    Parameters:
        html_content: raw HTML string to analyze.
        base_url: when non-empty, relative URLs are resolved against it.
        min_children: minimum number of similar direct children required.
        min_similarity: minimum fraction of children sharing the most
            common signature.

    Returns:
        ``(sorted_elements, top_element_info, html_output)`` where
        sorted_elements is ``[(identifier, image_count, products_count)]``
        ordered by score, top_element_info describes the best container and
        its products, and html_output is that container's SVG-free HTML.
        On no match: ``([], {"error": ...}, "")``.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    if base_url:
        soup = convert_relative_urls(soup, base_url)

    elements_with_scores = _collect_scored_containers(soup, min_children, min_similarity)
    if not elements_with_scores:
        return [], {"error": "No elements with images found"}, ""

    # Best candidate first: highest combined score wins.
    elements_with_scores.sort(key=lambda item: item[2], reverse=True)

    sorted_elements = [
        (get_element_identifier(element), image_count, products_count)
        for element, image_count, _, products_count in elements_with_scores
    ]

    top_element = elements_with_scores[0][0]
    # Strip SVGs once for the container; children are stripped again
    # individually when extracted as products.
    top_element_no_svg = remove_svg_elements(top_element)
    products = _extract_products(top_element_no_svg)

    top_element_info = {
        "parent": {
            "tag": top_element_no_svg.name,
            "identifier": get_element_identifier(top_element_no_svg),
            "classes": top_element_no_svg.get('class', []),
            "id": top_element_no_svg.get('id', None),
        },
        "products_count": len(products),
        "products": products,
    }
    return sorted_elements, top_element_info, str(top_element_no_svg)
def get_download_link(content, filename, content_type="application/json"):
    """
    Return an HTML anchor that downloads *content* as *filename* via a
    base64 data URI.

    Bug fixed: the anchor previously hard-coded the literal text
    "(unknown)" instead of interpolating *filename*, so the download name
    and link label were wrong for every file. The default MIME type
    "file/json" was also not a registered type; "application/json" is.
    """
    b64 = base64.b64encode(content.encode()).decode()
    return (
        f'<a href="data:{content_type};base64,{b64}" '
        f'download="{filename}">Download {filename}</a>'
    )
def main():
    """
    Streamlit entry point.

    Lets the user upload HTML files and tune the analysis thresholds, runs
    find_image_rich_parents on each file, shows the per-file results, and
    offers combined JSON and HTML downloads.

    Bug fixed: the per-file section header in the combined HTML download
    previously emitted the literal text "(unknown)" instead of the file
    name; it now interpolates *filename*.
    """
    st.title("HTML File Analyzer")
    st.write("Upload HTML files to analyze their structure and find image-rich elements")
    # File uploader allows multiple files
    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])
    if uploaded_files:
        all_results = {}
        all_html_outputs = {}
        # Analysis parameters
        col1, col2 = st.columns(2)
        with col1:
            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
        with col2:
            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)
        # Generate button
        if st.button("Generate Analysis"):
            # Show processing message
            with st.spinner('Processing files...'):
                all_results = {}
                all_html_outputs = {}
                # Process each file independently; one bad file must not
                # abort the rest of the batch.
                for uploaded_file in uploaded_files:
                    st.subheader(f"Analysis for {uploaded_file.name}")
                    try:
                        # Read and process the file
                        html_content = uploaded_file.read().decode('utf-8')
                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
                            html_content,
                            min_children=min_children,
                            min_similarity=min_similarity
                        )
                        # Display results
                        st.write("Elements containing images:")
                        for element, img_count, prod_count in sorted_elements:
                            st.write(f"- {element}: {img_count} images, {prod_count} products")
                        # Store results keyed by file name for the downloads
                        all_results[uploaded_file.name] = top_element_info
                        all_html_outputs[uploaded_file.name] = html_output
                    except Exception as e:
                        st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                        continue
                # Create download buttons if we have results
                if all_results:
                    st.subheader("Download Results")
                    col1, col2 = st.columns(2)
                    # JSON download
                    with col1:
                        json_str = json.dumps(all_results, indent=2)
                        st.markdown(get_download_link(json_str, 'analysis_results.json'),
                                    unsafe_allow_html=True)
                    # HTML download
                    with col2:
                        # Combine all HTML outputs with file names as headers
                        combined_html = """
                        <!DOCTYPE html>
                        <html>
                        <head>
                        <meta charset='UTF-8'>
                        <style>
                        div {
                            width: auto !important;
                            height: auto !important;
                            padding: 0 !important;
                            margin: 0 !important;
                        }
                        img {
                            width: 300px;
                            height: 300px;
                            object-fit: contain;
                        }
                        body { font-family: Arial, sans-serif; }
                        .file-section { margin: 20px 0; }
                        .file-header {
                            background: #f0f0f0;
                            padding: 10px;
                            margin: 20px 0;
                        }
                        </style>
                        </head>
                        <body>
                        """
                        for filename, html in all_html_outputs.items():
                            combined_html += f"""
                            <div class="file-section">
                                <h2 class="file-header">{filename}</h2>
                                {html}
                            </div>
                            """
                        combined_html += "</body></html>"
                        st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
                                    unsafe_allow_html=True)
                    # Success message
                    st.success("Analysis completed successfully!")
# Run the Streamlit app when executed directly. (Removed a stray trailing
# " |" after the call — a copy/paste artifact that was a syntax error.)
if __name__ == "__main__":
    main()