cyberandy committed
Commit 1eab466 · verified · 1 Parent(s): 2a4cb91

Upload 3 files

Files changed (3)
  1. README (1).md +14 -0
  2. app (2).py +267 -0
  3. requirements.txt +7 -0
README (1).md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Create Llms Txt
+ emoji: 📈
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.20.0
+ app_file: app.py
+ pinned: true
+ license: mit
+ short_description: A simple app to create an llms.txt file for your site.
+ ---
+
+ Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
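
For context, the content generated by app (2).py below follows a simple markdown shape: a page-title heading, the meta description as a blockquote, then one section per extracted link. An illustrative (hypothetical) example of the output, not part of this commit:

```markdown
# Example Site

> A short meta description of the site.

## About Us
[About Us](https://example.com/about)

## Blog
[Blog](https://example.com/blog)
```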
app (2).py ADDED
@@ -0,0 +1,267 @@
+ import gradio as gr
+ import advertools as adv
+ import pandas as pd
+ import re
+ from secrets import token_hex
+ import logging
+ import os
+ from markitdown import MarkItDown
+ from typing import Tuple, List, Optional
+ import validators
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ )
+ logger = logging.getLogger(__name__)
+
+ # Initialize MarkItDown
+ md_converter = MarkItDown()
+
+
+ def validate_url(url: str) -> Tuple[bool, str]:
+     """Validate URL format and accessibility."""
+     if not url:
+         return False, "URL is required"
+
+     if not url.startswith(("http://", "https://")):
+         url = "https://" + url
+
+     if not validators.url(url):
+         return False, "Invalid URL format"
+
+     return True, url
+
+
+ def safe_crawl(url: str, output_file: str) -> bool:
+     """Safely perform a web crawl with timeout and error handling."""
+     try:
+         adv.crawl(
+             url,
+             output_file,
+             follow_links=False,
+             custom_settings={
+                 "CLOSESPIDER_TIMEOUT": 30,
+                 "ROBOTSTXT_OBEY": True,
+                 "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
+                 "USER_AGENT": "Mozilla/5.0 (compatible; LLMContentBot/1.0)",
+                 "DOWNLOAD_TIMEOUT": 10,
+             },
+         )
+         return True
+     except Exception as e:
+         logger.error(f"Crawl error for {url}: {str(e)}")
+         return False
+
+
+ def clean_text(text: str) -> str:
+     """Clean and format text by removing extra whitespace and normalizing spacing."""
+     if not text:
+         return ""
+     # Remove extra whitespace and newlines
+     text = re.sub(r"[\n\s]+", " ", text)
+     # Split camelCase words
+     text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
+     # Clean extra spaces
+     text = " ".join(text.split())
+     return text.strip()
+
+
+ def process_link_pair(url: str, text: str, seen_links: set) -> Optional[str]:
+     """Process a single link-text pair and return markdown if valid."""
+     if not url or not text:
+         return None
+
+     url = url.strip()
+     text = clean_text(text)
+
+     if not text or not url or url in seen_links:
+         return None
+
+     seen_links.add(url)
+     return f"## {text}\n[{text}]({url})"
+
+
+ def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
+     """Process links based on selected types with deduplication."""
+     try:
+         all_links = []
+         seen_links = set()  # Track unique URLs
+
+         if "All links" in link_types or not link_types:
+             link_df = adv.crawlytics.links(crawl_df)
+             for link, text in link_df[["link", "text"]].dropna().values:
+                 if md_link := process_link_pair(link, text, seen_links):
+                     all_links.append(md_link)
+         else:
+             for link_type in link_types:
+                 type_match = re.findall(r"header|footer|nav", link_type.lower())
+                 if type_match:
+                     col_prefix = type_match[0]
+                     # advertools joins multi-valued crawl columns with "@@"
+                     urls = crawl_df[f"{col_prefix}_links_url"].iloc[0]
+                     texts = crawl_df[f"{col_prefix}_links_text"].iloc[0]
+
+                     # Guard against NaN when the page has no links of this type
+                     if pd.notna(urls) and pd.notna(texts):
+                         urls = urls.split("@@")
+                         texts = texts.split("@@")
+
+                         for url, text in zip(urls, texts):
+                             if md_link := process_link_pair(url, text, seen_links):
+                                 all_links.append(md_link)
+
+         return "\n\n".join(all_links)
+     except Exception as e:
+         logger.error(f"Link processing error: {str(e)}")
+         return ""
+
+
+ def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
+     """Process website URL and generate markdown content."""
+     valid, result = validate_url(url)
+     if not valid:
+         return "", result
+
+     url = result
+     output_file = f"crawl_{token_hex(6)}.jsonl"
+
+     try:
+         if not safe_crawl(url, output_file):
+             return "", "Crawl failed or timed out"
+
+         crawl_df = pd.read_json(output_file, lines=True)
+         if crawl_df.empty:
+             return "", "No data found for the URL"
+
+         # Extract and clean title and description
+         title = (
+             clean_text(crawl_df["title"].iloc[0])
+             if "title" in crawl_df.columns
+             else "Untitled"
+         )
+         meta_desc = (
+             clean_text(crawl_df["meta_desc"].iloc[0])
+             if "meta_desc" in crawl_df.columns
+             else ""
+         )
+
+         # Process links
+         links_content = process_links(crawl_df, link_types)
+
+         # Generate final markdown
+         content = f"# {title}\n\n"
+         if meta_desc:
+             content += f"> {meta_desc}\n\n"
+         content += links_content
+
+         return content, f"Successfully processed {url}"
+
+     except Exception as e:
+         logger.error(f"Error processing {url}: {str(e)}")
+         return "", f"Error: {str(e)}"
+     finally:
+         if os.path.exists(output_file):
+             os.remove(output_file)
+
+
+ def process_file(file: gr.File) -> Tuple[str, str]:
+     """Convert uploaded file to markdown."""
+     if not file:
+         return "", "No file uploaded"
+
+     supported_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt"}
+     file_ext = os.path.splitext(file.name)[1].lower()
+
+     if file_ext not in supported_extensions:
+         return "", f"Unsupported file type: {file_ext}"
+
+     try:
+         result = md_converter.convert(file.name)
+         return result.text_content, "File processed successfully"
+     except Exception as e:
+         logger.error(f"File processing error: {str(e)}")
+         return "", f"Error processing file: {str(e)}"
+
+
+ # Custom CSS for styling
+ css = """
+ @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');
+
+ body {
+     font-family: 'Open Sans', sans-serif !important;
+ }
+
+ .primary-btn {
+     background-color: #3452db !important;
+ }
+
+ .primary-btn:hover {
+     background-color: #2a41af !important;
+ }
+ """
+
+ # Create a custom theme
+ theme = gr.themes.Soft(
+     primary_hue=gr.themes.colors.Color(
+         name="blue",
+         c50="#eef1ff",
+         c100="#e0e5ff",
+         c200="#c3cbff",
+         c300="#a5b2ff",
+         c400="#8798ff",
+         c500="#6a7eff",
+         c600="#3452db",
+         c700="#2a41af",
+         c800="#1f3183",
+         c900="#152156",
+         c950="#0a102b",
+     )
+ )
+
+ # Create interface
+ with gr.Blocks(
+     theme=theme,  # apply the custom theme defined above
+     css=css,
+     head="""
+     <link rel="canonical" href="https://wordlift.io/generate-llms-txt/" />
+     <meta name="description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
+     <meta property="og:title" content="LLMs.txt Generator by WordLift" />
+     <meta property="og:description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
+     <meta property="og:url" content="https://wordlift.io/generate-llms-txt/" />
+     """,
+ ) as iface:
+     gr.Markdown("# LLMs.txt Generator")
+
+     with gr.Tab("Website URL"):
+         url_input = gr.Textbox(label="Website URL", placeholder="example.com")
+         link_types = gr.Dropdown(
+             choices=["All links", "<header> links", "<nav> links", "<footer> links"],
+             multiselect=True,
+             value=["All links"],
+             label="Link Types to Extract",
+         )
+         url_button = gr.Button("Process URL", variant="primary")
+         url_output = gr.Textbox(
+             label="Generated Content", lines=20, show_copy_button=True
+         )
+         url_status = gr.Textbox(label="Status")
+
+         url_button.click(
+             process_url,
+             inputs=[url_input, link_types],
+             outputs=[url_output, url_status],
+         )
+
+     with gr.Tab("File Converter"):
+         file_input = gr.File(label="Upload Document")
+         file_button = gr.Button("Convert to Markdown", variant="primary")
+         file_output = gr.Textbox(
+             label="Converted Content", lines=20, show_copy_button=True
+         )
+         file_status = gr.Textbox(label="Status")
+
+         file_button.click(
+             process_file, inputs=[file_input], outputs=[file_output, file_status]
+         )
+
+ if __name__ == "__main__":
+     iface.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ numpy>=1.23.5
+ pandas>=1.5.0
+ scipy>=1.10.0
+ advertools
+ markitdown
+ validators
+ gradio
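
For a quick local check, a minimal sketch of how the functions above could be exercised outside the Space (hypothetical usage, assuming "app (2).py" is saved as app.py and the requirements are installed):

```python
# smoke_test.py - hypothetical local test, not part of this commit
from app import validate_url, process_url

# Bare domains are normalized to https:// before validation
ok, url = validate_url("example.com")
print(ok, url)  # expected: True https://example.com

# Crawl a single page (follow_links=False) and build llms.txt-style markdown
content, status = process_url("example.com", ["All links"])
print(status)
print(content[:300])
```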