# ai-news-analyzer / src/md_html.py
# Sigrid De los Santos
# Remove remaining binary file for Hugging Face
# 9df4cc0
import os
import re
import markdown
from image_search import search_unsplash_image # Must return (image_url, image_credit)
def convert_md_folder_to_html(md_folder, html_output_folder):
    """Convert every ``.md`` file directly inside *md_folder* to an HTML page.

    Each matching file is handed to ``convert_single_md_to_html``, so the
    batch path and the single-file path render through one shared template
    instead of two copies that can drift apart.

    Args:
        md_folder: Directory scanned (non-recursively) for ``.md`` files.
        html_output_folder: Directory that receives the generated ``.html``
            files; it is created if it does not exist.
    """
    # Create the output folder up front so it exists even when the source
    # folder contains no Markdown files.
    os.makedirs(html_output_folder, exist_ok=True)

    for filename in os.listdir(md_folder):
        md_path = os.path.join(md_folder, filename)
        # Skip anything that is not a regular ``.md`` file; a directory whose
        # name happens to end in ".md" cannot be opened for reading.
        if not filename.endswith(".md") or not os.path.isfile(md_path):
            continue
        convert_single_md_to_html(md_path, html_output_folder)
import os
import re
import markdown
from image_search import search_unsplash_image # Must return (image_url, image_credit)
def _split_metrics_blockquote(html_body):
    """Cut the first ``<blockquote>`` out of *html_body* and list its lines.

    Returns:
        A ``(remaining_html, metrics_html)`` tuple. ``metrics_html`` is a
        ``<ul>`` with one ``<li>`` per non-empty text line of the blockquote,
        or ``""`` when *html_body* holds no complete blockquote (in which
        case ``remaining_html`` is *html_body* unchanged).
    """
    open_tag, close_tag = "<blockquote>", "</blockquote>"
    start = html_body.find(open_tag)
    if start == -1:
        return html_body, ""

    # Look for the closing tag only after the opening one, and bail out when
    # it is missing, so a stray or unclosed tag cannot produce a bogus slice.
    close = html_body.find(close_tag, start)
    if close == -1:
        return html_body, ""
    end = close + len(close_tag)

    # Drop all tags from the blockquote and keep its text, one item per line.
    text = re.sub(r'<.*?>', '', html_body[start:end]).strip()
    items = [f"<li>{line.strip()}</li>" for line in text.splitlines() if line.strip()]
    return html_body[:start] + html_body[end:], f"<ul>{''.join(items)}</ul>"


def _render_page(title, image_url, html_body, metrics_block):
    """Return the full HTML document for one converted article.

    *title* and *image_url* are escaped before interpolation because they end
    up in ``<title>`` and in quoted attributes; *html_body* and
    *metrics_block* are inserted as-is since they are already HTML.
    """
    import html  # local import: only this helper needs it

    safe_title = html.escape(title, quote=True)
    # str() keeps a non-string result from the image search renderable
    # instead of raising inside html.escape.
    safe_image_url = html.escape(str(image_url), quote=True)

    return f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{safe_title}</title>
<style>
body {{
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin: 0;
background-color: #f8f9fa;
color: #212529;
line-height: 1.6;
}}
header {{
background-color: #ffffff;
text-align: center;
padding: 1em;
border-bottom: 2px solid #dee2e6;
}}
header img {{
width: 100%;
height: auto;
max-height: 50vh;
object-fit: cover;
}}
.credit {{
font-size: 0.85em;
color: #6c757d;
margin-top: 0.5em;
}}
.container {{
display: flex;
flex-direction: row;
max-width: 1200px;
margin: 2em auto;
padding: 0 1em;
gap: 2em;
}}
main {{
flex: 3;
}}
aside {{
flex: 1;
background-color: #ffffff;
border: 1px solid #dee2e6;
border-radius: 8px;
padding: 1em;
box-shadow: 0 2px 6px rgba(0,0,0,0.05);
height: fit-content;
}}
main img {{
max-width: 100%;
height: auto;
display: block;
margin: 1.5em auto;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
}}
h1, h2, h3 {{
color: #0d6efd;
}}
a {{
color: #0d6efd;
text-decoration: none;
}}
a:hover {{
text-decoration: underline;
}}
code {{
background: #e9ecef;
padding: 0.2em 0.4em;
border-radius: 4px;
font-family: monospace;
}}
pre {{
background: #e9ecef;
padding: 1em;
overflow-x: auto;
border-radius: 6px;
}}
blockquote {{
border-left: 4px solid #0d6efd;
padding-left: 1em;
color: #495057;
margin: 1em 0;
background: #f1f3f5;
}}
</style>
</head>
<body>
<header>
<img src="{safe_image_url}" alt="{safe_title} Banner">
</header>
<div class="container">
<main>
{html_body}
</main>
<aside>
<h3>🧠 Metrics</h3>
{metrics_block}
</aside>
</div>
</body>
</html>
"""


def convert_single_md_to_html(md_path, html_output_folder):
    """Convert one Markdown file into a styled, standalone HTML page.

    The page title is the file name without its extension. Markdown image
    links are removed from the source, a banner image is looked up with
    ``search_unsplash_image(title)``, and the first blockquote of the
    rendered body is moved into a "Metrics" sidebar as a bullet list.

    Args:
        md_path: Path of the Markdown file to read (UTF-8).
        html_output_folder: Directory that receives ``<title>.html``; it is
            created if it does not exist.
    """
    os.makedirs(html_output_folder, exist_ok=True)

    # splitext removes only the trailing extension; str.replace(".md", "")
    # would also eat any ".md" occurring in the middle of the name.
    title, _extension = os.path.splitext(os.path.basename(md_path))
    html_path = os.path.join(html_output_folder, title + ".html")

    with open(md_path, "r", encoding="utf-8") as f:
        md_content = f.read()

    # Remove Markdown image syntax; the only image on the page is the banner.
    md_content = re.sub(r'!\[.*?\]\(.*?\)', '', md_content)
    html_body = markdown.markdown(md_content, extensions=["extra", "codehilite", "toc"])
    # Unwrap paragraphs that consist solely of bracketed numeric references
    # such as "[1], [2]" so they are not rendered as separate paragraphs.
    html_body = re.sub(r'<p>(\[\d+\](?:,\s*\[\d+\])*)</p>', r'\1', html_body)

    # The search helper returns (image_url, image_credit); the credit is not
    # rendered. NOTE(review): the template defines a `.credit` style that no
    # element uses -- confirm whether the credit line should be shown.
    image_url, _image_credit = search_unsplash_image(title)

    html_body, metrics_block = _split_metrics_blockquote(html_body)
    page = _render_page(title, image_url, html_body, metrics_block)

    with open(html_path, "w", encoding="utf-8") as f:
        f.write(page)
    print(f"✅ Converted: {md_path} -> {html_path}")
# FOR TESTING ONLY: ad-hoc manual run of the folder conversion.
# Usage: python md_html.py [MD_FOLDER] [HTML_OUTPUT_FOLDER]
# With no arguments the author's local paths below are used as defaults.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    md_folder = (
        cli_args[0]
        if len(cli_args) > 0
        else "/Users/sigridveronica/Desktop/Investing/data"
    )
    html_output_folder = (
        cli_args[1]
        if len(cli_args) > 1
        else "/Users/sigridveronica/Desktop/Investing/html"
    )
    convert_md_folder_to_html(md_folder, html_output_folder)