cyberandy committed
Commit 1eab466 · verified · 1 Parent(s): 2a4cb91

Upload 3 files

Files changed (3)
  1. README (1).md +14 -0
  2. app (2).py +267 -0
  3. requirements.txt +7 -0
README (1).md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Create Llms Txt
+ emoji: 📈
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.20.0
+ app_file: app.py
+ pinned: true
+ license: mit
+ short_description: A simple app to create an llms.txt file for your site.
+ ---
+
+ Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
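
For context, the content generated by app (2).py below follows a simple markdown shape: a page-title heading, the meta description as a blockquote, then one section per extracted link. An illustrative (hypothetical) example of the output, not part of this commit:

```markdown
# Example Site

> A short meta description of the site.

## About Us
[About Us](https://example.com/about)

## Blog
[Blog](https://example.com/blog)
```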
app (2).py ADDED
@@ -0,0 +1,267 @@
+ import gradio as gr
+ import advertools as adv
+ import pandas as pd
+ import re
+ from secrets import token_hex
+ import logging
+ import os
+ from markitdown import MarkItDown
+ from typing import Tuple, List, Optional
+ import validators
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ )
+ logger = logging.getLogger(__name__)
+
+ # Initialize MarkItDown
+ md_converter = MarkItDown()
+
+
+ def validate_url(url: str) -> Tuple[bool, str]:
+     """Validate URL format and accessibility."""
+     if not url:
+         return False, "URL is required"
+
+     if not url.startswith(("http://", "https://")):
+         url = "https://" + url
+
+     if not validators.url(url):
+         return False, "Invalid URL format"
+
+     return True, url
+
+
+ def safe_crawl(url: str, output_file: str) -> bool:
+     """Safely perform a web crawl with timeout and error handling."""
+     try:
+         adv.crawl(
+             url,
+             output_file,
+             follow_links=False,
+             custom_settings={
+                 "CLOSESPIDER_TIMEOUT": 30,
+                 "ROBOTSTXT_OBEY": True,
+                 "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
+                 "USER_AGENT": "Mozilla/5.0 (compatible; LLMContentBot/1.0)",
+                 "DOWNLOAD_TIMEOUT": 10,
+             },
+         )
+         return True
+     except Exception as e:
+         logger.error(f"Crawl error for {url}: {str(e)}")
+         return False
+
+
+ def clean_text(text: str) -> str:
+     """Clean and format text by removing extra whitespace and normalizing spacing."""
+     if not text:
+         return ""
+     # Remove extra whitespace and newlines
+     text = re.sub(r"[\n\s]+", " ", text)
+     # Split camelCase words
+     text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
+     # Clean extra spaces
+     text = " ".join(text.split())
+     return text.strip()
+
+
+ def process_link_pair(url: str, text: str, seen_links: set) -> Optional[str]:
+     """Process a single link-text pair and return markdown if valid."""
+     if not url or not text:
+         return None
+
+     url = url.strip()
+     text = clean_text(text)
+
+     if not text or not url or url in seen_links:
+         return None
+
+     seen_links.add(url)
+     return f"## {text}\n[{text}]({url})"
+
+
+ def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
+     """Process links based on selected types with deduplication."""
+     try:
+         all_links = []
+         seen_links = set()  # Track unique URLs
+
+         if "All links" in link_types or not link_types:
+             link_df = adv.crawlytics.links(crawl_df)
+             for link, text in link_df[["link", "text"]].dropna().values:
+                 if md_link := process_link_pair(link, text, seen_links):
+                     all_links.append(md_link)
+         else:
+             for link_type in link_types:
+                 type_match = re.findall(r"header|footer|nav", link_type.lower())
+                 if type_match:
+                     col_prefix = type_match[0]
+                     # advertools joins multi-valued crawl columns with "@@"
+                     urls = crawl_df[f"{col_prefix}_links_url"].iloc[0]
+                     texts = crawl_df[f"{col_prefix}_links_text"].iloc[0]
+
+                     # Guard against NaN when the page has no links of this type
+                     if pd.notna(urls) and pd.notna(texts):
+                         urls = urls.split("@@")
+                         texts = texts.split("@@")
+
+                         for url, text in zip(urls, texts):
+                             if md_link := process_link_pair(url, text, seen_links):
+                                 all_links.append(md_link)
+
+         return "\n\n".join(all_links)
+     except Exception as e:
+         logger.error(f"Link processing error: {str(e)}")
+         return ""
+
+
+ def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
+     """Process website URL and generate markdown content."""
+     valid, result = validate_url(url)
+     if not valid:
+         return "", result
+
+     url = result
+     output_file = f"crawl_{token_hex(6)}.jsonl"
+
+     try:
+         if not safe_crawl(url, output_file):
+             return "", "Crawl failed or timed out"
+
+         crawl_df = pd.read_json(output_file, lines=True)
+         if crawl_df.empty:
+             return "", "No data found for the URL"
+
+         # Extract and clean title and description
+         title = (
+             clean_text(crawl_df["title"].iloc[0])
+             if "title" in crawl_df.columns
+             else "Untitled"
+         )
+         meta_desc = (
+             clean_text(crawl_df["meta_desc"].iloc[0])
+             if "meta_desc" in crawl_df.columns
+             else ""
+         )
+
+         # Process links
+         links_content = process_links(crawl_df, link_types)
+
+         # Generate final markdown
+         content = f"# {title}\n\n"
+         if meta_desc:
+             content += f"> {meta_desc}\n\n"
+         content += links_content
+
+         return content, f"Successfully processed {url}"
+
+     except Exception as e:
+         logger.error(f"Error processing {url}: {str(e)}")
+         return "", f"Error: {str(e)}"
+     finally:
+         if os.path.exists(output_file):
+             os.remove(output_file)
+
+
+ def process_file(file: gr.File) -> Tuple[str, str]:
+     """Convert uploaded file to markdown."""
+     if not file:
+         return "", "No file uploaded"
+
+     supported_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt"}
+     file_ext = os.path.splitext(file.name)[1].lower()
+
+     if file_ext not in supported_extensions:
+         return "", f"Unsupported file type: {file_ext}"
+
+     try:
+         result = md_converter.convert(file.name)
+         return result.text_content, "File processed successfully"
+     except Exception as e:
+         logger.error(f"File processing error: {str(e)}")
+         return "", f"Error processing file: {str(e)}"
+
+
+ # Custom CSS for styling
+ css = """
+ @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');
+
+ body {
+     font-family: 'Open Sans', sans-serif !important;
+ }
+
+ .primary-btn {
+     background-color: #3452db !important;
+ }
+
+ .primary-btn:hover {
+     background-color: #2a41af !important;
+ }
+ """
+
+ # Create a custom theme
+ theme = gr.themes.Soft(
+     primary_hue=gr.themes.colors.Color(
+         name="blue",
+         c50="#eef1ff",
+         c100="#e0e5ff",
+         c200="#c3cbff",
+         c300="#a5b2ff",
+         c400="#8798ff",
+         c500="#6a7eff",
+         c600="#3452db",
+         c700="#2a41af",
+         c800="#1f3183",
+         c900="#152156",
+         c950="#0a102b",
+     )
+ )
+
+ # Create interface
+ with gr.Blocks(
+     theme=theme,  # apply the custom theme defined above
+     css=css,
+     head="""
+     <link rel="canonical" href="https://wordlift.io/generate-llms-txt/" />
+     <meta name="description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
+     <meta property="og:title" content="LLMs.txt Generator by WordLift" />
+     <meta property="og:description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
+     <meta property="og:url" content="https://wordlift.io/generate-llms-txt/" />
+     """,
+ ) as iface:
+     gr.Markdown("# LLMs.txt Generator")
+
+     with gr.Tab("Website URL"):
+         url_input = gr.Textbox(label="Website URL", placeholder="example.com")
+         link_types = gr.Dropdown(
+             choices=["All links", "<header> links", "<nav> links", "<footer> links"],
+             multiselect=True,
+             value=["All links"],
+             label="Link Types to Extract",
+         )
+         url_button = gr.Button("Process URL", variant="primary")
+         url_output = gr.Textbox(
+             label="Generated Content", lines=20, show_copy_button=True
+         )
+         url_status = gr.Textbox(label="Status")
+
+         url_button.click(
+             process_url,
+             inputs=[url_input, link_types],
+             outputs=[url_output, url_status],
+         )
+
+     with gr.Tab("File Converter"):
+         file_input = gr.File(label="Upload Document")
+         file_button = gr.Button("Convert to Markdown", variant="primary")
+         file_output = gr.Textbox(
+             label="Converted Content", lines=20, show_copy_button=True
+         )
+         file_status = gr.Textbox(label="Status")
+
+         file_button.click(
+             process_file, inputs=[file_input], outputs=[file_output, file_status]
+         )
+
+ if __name__ == "__main__":
+     iface.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ numpy>=1.23.5
+ pandas>=1.5.0
+ scipy>=1.10.0
+ advertools
+ markitdown
+ validators
+ gradio
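
For a quick local check, a minimal sketch of how the functions above could be exercised outside the Space (hypothetical usage, assuming "app (2).py" is saved as app.py and the requirements are installed):

```python
# smoke_test.py - hypothetical local test, not part of this commit
from app import validate_url, process_url

# Bare domains are normalized to https:// before validation
ok, url = validate_url("example.com")
print(ok, url)  # expected: True https://example.com

# Crawl a single page (follow_links=False) and build llms.txt-style markdown
content, status = process_url("example.com", ["All links"])
print(status)
print(content[:300])
```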