Upload 3 files
- README (1).md +14 -0
- app (2).py +267 -0
- requirements.txt +7 -0
README (1).md
ADDED
@@ -0,0 +1,14 @@
---
title: Create Llms Txt
emoji: 📈
colorFrom: indigo
colorTo: blue
sdk: gradio
sdk_version: 5.20.0
app_file: app.py
pinned: true
license: mit
short_description: A simple app to create an llms.txt file for your site.
---

Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
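
For context, the file this Space generates is plain markdown. Based on how app (2).py below assembles its output (an H1 title, a blockquoted meta description, then one heading-plus-link block per extracted link), a crawl would produce something of roughly this shape (hypothetical site and values, shown only for illustration):

```markdown
# Example Site

> A short meta description of the site.

## About Us
[About Us](https://example.com/about)

## Contact
[Contact](https://example.com/contact)
```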
app (2).py
ADDED
@@ -0,0 +1,267 @@
import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Initialize MarkItDown
md_converter = MarkItDown()


def validate_url(url: str) -> Tuple[bool, str]:
    """Validate URL format and accessibility."""
    if not url:
        return False, "URL is required"

    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    if not validators.url(url):
        return False, "Invalid URL format"

    return True, url


def safe_crawl(url: str, output_file: str) -> bool:
    """Safely perform a web crawl with timeout and error handling."""
    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings={
                "CLOSESPIDER_TIMEOUT": 30,
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
                "USER_AGENT": "Mozilla/5.0 (compatible; LLMContentBot/1.0)",
                "DOWNLOAD_TIMEOUT": 10,
            },
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False


def clean_text(text: str) -> str:
    """Clean and format text by removing extra whitespace and normalizing spacing."""
    if not text:
        return ""
    # Remove extra whitespace and newlines
    text = re.sub(r"[\n\s]+", " ", text)
    # Split camelCase words
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    # Clean extra spaces
    text = " ".join(text.split())
    return text.strip()


def process_link_pair(url: str, text: str, seen_links: set) -> Optional[str]:
    """Process a single link-text pair and return markdown if valid."""
    if not url or not text:
        return None

    url = url.strip()
    text = clean_text(text)

    if not text or not url or url in seen_links:
        return None

    seen_links.add(url)
    return f"## {text}\n[{text}]({url})"


def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Process links based on selected types with deduplication."""
    try:
        all_links = []
        seen_links = set()  # Track unique URLs

        if "All links" in link_types or not link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[["link", "text"]].dropna().values:
                if md_link := process_link_pair(link, text, seen_links):
                    all_links.append(md_link)
        else:
            for link_type in link_types:
                type_match = re.findall(r"header|footer|nav", link_type.lower())
                if type_match:
                    col_prefix = type_match[0]
                    urls = crawl_df[f"{col_prefix}_links_url"].iloc[0]
                    texts = crawl_df[f"{col_prefix}_links_text"].iloc[0]

                    if urls and texts:
                        # advertools joins multiple values in one cell with "@@"
                        urls = urls.split("@@")
                        texts = texts.split("@@")

                        for url, text in zip(urls, texts):
                            if md_link := process_link_pair(url, text, seen_links):
                                all_links.append(md_link)

        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return ""


def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Process website URL and generate markdown content."""
    valid, result = validate_url(url)
    if not valid:
        return "", result

    url = result
    output_file = f"crawl_{token_hex(6)}.jsonl"

    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"

        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"

        # Extract and clean title and description
        title = (
            clean_text(crawl_df["title"].iloc[0])
            if "title" in crawl_df.columns
            else "Untitled"
        )
        meta_desc = (
            clean_text(crawl_df["meta_desc"].iloc[0])
            if "meta_desc" in crawl_df.columns
            else ""
        )

        # Process links
        links_content = process_links(crawl_df, link_types)

        # Generate final markdown
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content

        return content, f"Successfully processed {url}"

    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)


def process_file(file: gr.File) -> Tuple[str, str]:
    """Convert uploaded file to markdown."""
    if not file:
        return "", "No file uploaded"

    supported_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt"}
    file_ext = os.path.splitext(file.name)[1].lower()

    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"

    try:
        result = md_converter.convert(file.name)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"


# Custom CSS for styling
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""

# Create a custom theme
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

# Create interface
with gr.Blocks(
    theme=theme,  # use the custom theme defined above (was an unused variable before)
    css=css,
    head="""
    <link rel="canonical" href="https://wordlift.io/generate-llms-txt/" />
    <meta name="description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
    <meta property="og:title" content="LLMs.txt Generator by WordLift" />
    <meta property="og:description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
    <meta property="og:url" content="https://wordlift.io/generate-llms-txt/" />
    """,
) as iface:
    gr.Markdown("# LLMs.txt Generator")

    with gr.Tab("Website URL"):
        url_input = gr.Textbox(label="Website URL", placeholder="example.com")
        link_types = gr.Dropdown(
            choices=["All links", "<header> links", "<nav> links", "<footer> links"],
            multiselect=True,
            value=["All links"],
            label="Link Types to Extract",
        )
        url_button = gr.Button("Process URL", variant="primary")
        url_output = gr.Textbox(
            label="Generated Content", lines=20, show_copy_button=True
        )
        url_status = gr.Textbox(label="Status")

        url_button.click(
            process_url,
            inputs=[url_input, link_types],
            outputs=[url_output, url_status],
        )

    with gr.Tab("File Converter"):
        file_input = gr.File(label="Upload Document")
        file_button = gr.Button("Convert to Markdown", variant="primary")
        file_output = gr.Textbox(
            label="Converted Content", lines=20, show_copy_button=True
        )
        file_status = gr.Textbox(label="Status")

        file_button.click(
            process_file, inputs=[file_input], outputs=[file_output, file_status]
        )

if __name__ == "__main__":
    iface.launch()
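
As a quick sanity check, here is a minimal sketch of driving the pipeline without the UI. It assumes the file above is saved as app.py (matching the README's app_file setting) and uses example.com purely as a placeholder domain:

```python
# Minimal sketch: call the crawl-and-format pipeline directly,
# bypassing the Gradio interface. Assumes this module is saved as
# app.py; example.com is a placeholder domain.
from app import process_url

content, status = process_url("example.com", ["All links"])
print(status)         # e.g. "Successfully processed https://example.com"
print(content[:300])  # "# <title>", "> <meta desc>", then "## <text>\n[<text>](<url>)" blocks
```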
requirements.txt
ADDED
@@ -0,0 +1,7 @@
numpy>=1.23.5
pandas>=1.5.0
scipy>=1.10.0
advertools
markitdown
validators
gradio
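
To try the Space locally, the usual Gradio workflow should apply: install the dependencies with `pip install -r requirements.txt`, then start the app with `python app.py` (it launches via `iface.launch()` in the `__main__` guard).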