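"""Streamlit app for analyzing HTML files.

Scans each uploaded file for container elements (div/ul/section/main) whose
direct children share a repeated structure and contain images, a heuristic
that tends to locate product grids and similar listings, then offers the
results as JSON and HTML downloads.
"""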
import base64
import copy
import json
from collections import Counter
from urllib.parse import urljoin
import streamlit as st
from bs4 import BeautifulSoup
def remove_svg_elements(element):
"""
Remove all SVG elements from a BeautifulSoup element.
Returns a copy of the element with SVGs removed.
"""
# Create a copy of the element to avoid modifying the original
element_copy = copy.copy(element)
# Find and remove all SVG elements
if hasattr(element_copy, 'find_all'):
svg_elements = element_copy.find_all('svg')
for svg in svg_elements:
svg.decompose()
return element_copy
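# Note: bs4 Tag objects implement __copy__, so copy.copy() above yields a
# standalone copy detached from the original tree (as if extract() had been
# called); the decompose() calls therefore never mutate the caller's soup.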
def get_element_signature(element):
"""
Create a signature for an element based on its structure.
"""
signature = {
'tag': element.name,
'classes': tuple(sorted(element.get('class', []))),
'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),
'has_image': bool(element.find('img')),
'has_price': bool(any(c in element.get_text() for c in '$€£¥')),
'has_link': bool(element.find('a')),
}
return str(signature)
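# Illustrative example (hypothetical markup): a product card such as
#   <li class="card"><a href="/p/1"><img src="a.jpg"></a><span>$9.99</span></li>
# produces a signature string like
#   "{'tag': 'li', 'classes': ('card',), 'child_tags': ('a', 'span'),
#    'has_image': True, 'has_price': True, 'has_link': True}"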
def analyze_children_similarity(element):
"""
Analyze how similar the direct children of an element are.
"""
if not element.contents:
return 0, 0
child_signatures = [
get_element_signature(child)
for child in element.find_all(recursive=False)
if child.name
]
if not child_signatures:
return 0, 0
signature_counts = Counter(child_signatures)
most_common_sig, most_common_count = signature_counts.most_common(1)[0]
similarity_score = most_common_count / len(child_signatures)
return similarity_score, most_common_count
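# Worked example: a <ul> with eight identically structured <li> cards plus one
# trailing "view more" <li> scores 8/9 ~= 0.89 similarity with
# most_common_count 8, comfortably above the default 0.7 threshold.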
def count_images_in_element(element):
"""
Count all images within an element, including nested ones.
"""
return len(element.find_all('img', recursive=True))
def get_element_identifier(element):
"""
Create a unique identifier for an element including tag and classes.
"""
identifier = element.name
if element.get('class'):
identifier += f" .{' .'.join(element['class'])}"
if element.get('id'):
identifier += f" #{element['id']}"
return identifier
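# Example: <div class="grid products" id="catalog"> (hypothetical markup)
# yields the identifier 'div .grid .products #catalog'.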
def convert_relative_urls(soup, base_url):
"""
Convert all relative URLs in the soup object to absolute URLs.
"""
for tag in soup.find_all(href=True):
tag['href'] = urljoin(base_url, tag['href'])
for tag in soup.find_all(src=True):
tag['src'] = urljoin(base_url, tag['src'])
for tag in soup.find_all(attrs={'data-src': True}):
tag['data-src'] = urljoin(base_url, tag['data-src'])
return soup
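# Example of the urljoin() behavior relied on above: with base_url
# "https://example.com/shop/page1", "img/a.jpg" becomes
# "https://example.com/shop/img/a.jpg" and "/img/a.jpg" becomes
# "https://example.com/img/a.jpg"; already-absolute URLs pass through unchanged.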
def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
"""
Find elements containing images and return both sorted list and detailed top element info.
"""
soup = BeautifulSoup(html_content, "html.parser")
# Convert relative URLs to absolute if base_url is provided
if base_url:
soup = convert_relative_urls(soup, base_url)
# Collect potential container elements with their scores
elements_with_scores = []
for element in soup.find_all():
if element.name in ['div', 'ul', 'section', 'main']:
similarity_score, similar_children_count = analyze_children_similarity(element)
image_count = count_images_in_element(element)
if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:
# Count products (direct children with images)
products_count = len([child for child in element.find_all(recursive=False)
if child.name and child.find('img', recursive=True)])
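                # Rank by how uniform the children are (similarity), how many
                # repeat (count), and how image-dense the container is.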
combined_score = (similarity_score * similar_children_count * image_count)
elements_with_scores.append((element, image_count, combined_score, products_count))
if not elements_with_scores:
return [], {"error": "No elements with images found"}, ""
# Sort by combined score
elements_with_scores.sort(key=lambda x: x[2], reverse=True)
# Process elements for sorted list output
sorted_elements = []
for element, image_count, _, products_count in elements_with_scores:
sorted_elements.append((get_element_identifier(element), image_count, products_count))
# Get top element (one with highest combined score)
top_element = elements_with_scores[0][0]
# Remove SVGs from the top element for HTML output
top_element_no_svg = remove_svg_elements(top_element)
# Separate child elements with images
products = []
    for child in top_element_no_svg.find_all(recursive=False):
        if child.name:  # find_all() yields only Tags; kept as a cheap guard
# Remove SVGs from each product
child_no_svg = remove_svg_elements(child)
product_info = {
"html_content": str(child_no_svg),
"images": []
}
# Get all images within this product
for img in child_no_svg.find_all('img', recursive=True):
image_info = {
"src": img.get('src', 'No source'),
"alt": img.get('alt', 'No alt text')
}
product_info["images"].append(image_info)
products.append(product_info)
# Create result dictionary for top element
top_element_info = {
"parent": {
"tag": top_element_no_svg.name,
"identifier": get_element_identifier(top_element_no_svg),
"classes": top_element_no_svg.get('class', []),
"id": top_element_no_svg.get('id', None)
},
"products_count": len(products),
"products": products
}
html_output = str(top_element_no_svg)
return sorted_elements, top_element_info, html_output
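# Standalone usage sketch, outside the Streamlit flow (assumes a local file
# named "listing.html"; names here are illustrative):
#
#   with open("listing.html", encoding="utf-8") as f:
#       ranked, top_info, top_html = find_image_rich_parents(
#           f.read(), base_url="https://example.com")
#   print(ranked[:3])                      # [(identifier, images, products), ...]
#   print(top_info.get("products_count"))  # absent only on the error path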
def get_download_link(content, filename, content_type="application/json"):
    """Generate a download link for the given content."""
    b64 = base64.b64encode(content.encode()).decode()
    return f'<a href="data:{content_type};base64,{b64}" download="{filename}">Download {filename}</a>'
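# The link embeds the payload as a base64 data: URL, so the download works
# without any server-side endpoint; note st.markdown must be called with
# unsafe_allow_html=True (as done below) for the anchor to render.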
def main():
st.title("HTML File Analyzer")
st.write("Upload HTML files to analyze their structure and find image-rich elements")
# File uploader allows multiple files
uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])
if uploaded_files:
all_results = {}
all_html_outputs = {}
# Analysis parameters
col1, col2 = st.columns(2)
with col1:
min_children = st.slider("Minimum number of similar children", 1, 10, 4)
with col2:
min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)
# Generate button
if st.button("Generate Analysis"):
# Show processing message
with st.spinner('Processing files...'):
all_results = {}
all_html_outputs = {}
# Process each file
for uploaded_file in uploaded_files:
st.subheader(f"Analysis for {uploaded_file.name}")
try:
# Read and process the file
html_content = uploaded_file.read().decode('utf-8')
sorted_elements, top_element_info, html_output = find_image_rich_parents(
html_content,
min_children=min_children,
min_similarity=min_similarity
)
# Display results
st.write("Elements containing images:")
for element, img_count, prod_count in sorted_elements:
st.write(f"- {element}: {img_count} images, {prod_count} products")
# Store results
all_results[uploaded_file.name] = top_element_info
all_html_outputs[uploaded_file.name] = html_output
except Exception as e:
st.error(f"Error processing {uploaded_file.name}: {str(e)}")
continue
# Create download buttons if we have results
if all_results:
st.subheader("Download Results")
col1, col2 = st.columns(2)
# JSON download
with col1:
json_str = json.dumps(all_results, indent=2)
st.markdown(get_download_link(json_str, 'analysis_results.json'),
unsafe_allow_html=True)
# HTML download
with col2:
# Combine all HTML outputs with file names as headers
combined_html = """
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<style>
div {
width: auto !important;
height: auto !important;
padding: 0 !important;
margin: 0 !important;
}
img {
width: 300px;
height: 300px;
object-fit: contain;
}
body { font-family: Arial, sans-serif; }
.file-section { margin: 20px 0; }
.file-header {
background: #f0f0f0;
padding: 10px;
margin: 20px 0;
}
</style>
</head>
<body>
"""
for filename, html in all_html_outputs.items():
combined_html += f"""
<div class="file-section">
<h2 class="file-header">{filename}</h2>
{html}
</div>
"""
combined_html += "</body></html>"
st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
unsafe_allow_html=True)
# Success message
st.success("Analysis completed successfully!")
if __name__ == "__main__":
main()