Rename web_extraction.py to deployment.py
- deployment.py +120 -0
- web_extraction.py +0 -271
deployment.py
ADDED
@@ -0,0 +1,120 @@
```python
# /deployment.py

"""
Handles deployment of generated code to Hugging Face Spaces.

This module provides functions to wrap generated code into a runnable
Gradio or Static HTML app and to programmatically create and upload
it to a user's Hugging Face Space.
"""
import tempfile
import webbrowser
import logging
from urllib.parse import urlencode

from huggingface_hub import HfApi, HfFolder
import gradio as gr

def _create_space_readme(space_name: str, sdk: str) -> str:
    """Generates a standard README.md file for the new Space."""
    return f"""---
title: {space_name}
emoji: 🚀
colorFrom: blue
colorTo: green
sdk: {sdk}
---

# {space_name}

This Space was generated by [AnyCoder](<YOUR_APP_SPACE_URL>).
"""

def deploy_to_hf_space(
    code: str,
    space_name: str,
    sdk: str,
    hf_token: str
) -> str:
    """
    Creates or updates a Hugging Face Space and uploads the generated code.

    Args:
        code: The code to deploy (HTML or Python).
        space_name: The desired name for the Space.
        sdk: The SDK for the Space ('static', 'gradio', 'streamlit').
        hf_token: The user's Hugging Face write token.

    Returns:
        A success or error message with a link to the Space.
    """
    if not code or not code.strip():
        return "Cannot deploy: No code has been generated."
    if not space_name or not space_name.strip():
        return "Cannot deploy: Please provide a name for your app."
    if not hf_token:
        # Fallback to URL-based deployment if no token is provided
        return deploy_via_url(code, space_name, sdk)

    try:
        api = HfApi(token=hf_token)
        user_info = api.whoami(token=hf_token)
        username = user_info['name']
        repo_id = f"{username}/{space_name.strip()}"

        api.create_repo(repo_id, repo_type="space", space_sdk=sdk, exist_ok=True)

        if sdk == 'static':
            file_content = code
            file_path_in_repo = "index.html"
        else:  # gradio or streamlit
            file_content = code  # Assume code is already wrapped for Python
            file_path_in_repo = "app.py"

        # Upload the main app file
        api.upload_file(
            path_or_fileobj=file_content.encode('utf-8'),
            path_in_repo=file_path_in_repo,
            repo_id=repo_id,
            repo_type="space"
        )
        # Upload a README
        readme_content = _create_space_readme(space_name, sdk)
        api.upload_file(
            path_or_fileobj=readme_content.encode('utf-8'),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="space"
        )

        space_url = f"https://huggingface.co/spaces/{repo_id}"
        return f"✅ Deployed successfully! [Open your Space]({space_url})"

    except Exception as e:
        logging.error(f"Failed to deploy to Hugging Face Space: {e}")
        return f"❌ Deployment failed: {str(e)}"

def deploy_via_url(code: str, space_name: str, sdk: str) -> str:
    """
    Opens a new browser tab with pre-filled parameters to create a Space.
    This is a fallback for users who are not logged in via OAuth.
    """
    if sdk == 'static':
        app_file = "index.html"
        content = code
    else:
        app_file = "app.py"
        # Basic wrapping for Python-based SDKs if needed
        content = code  # Assuming code is a Python string

    params = urlencode({
        "name": space_name,
        "sdk": sdk,
        "files[0][path]": app_file,
        "files[0][content]": content
    })
    base_url = "https://huggingface.co/new-space"
    full_url = f"{base_url}?{params}"

    webbrowser.open_new_tab(full_url)
    return "🚀 Your app is ready to launch! Check the new browser tab."
```
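As a quick reference, here is a minimal sketch of how `deploy_to_hf_space` could be wired into a Gradio UI. The component names, the `on_deploy` helper, and the `HF_TOKEN` environment variable are illustrative assumptions, not part of this commit.

```python
# Hypothetical caller for deploy_to_hf_space; names below are assumptions.
import os
import gradio as gr

from deployment import deploy_to_hf_space

def on_deploy(code: str, space_name: str, sdk: str) -> str:
    # Reads a write token from the environment; if it is unset,
    # deploy_to_hf_space falls back to the URL-based flow.
    return deploy_to_hf_space(code, space_name, sdk, os.getenv("HF_TOKEN"))

with gr.Blocks() as demo:
    code_box = gr.Code(label="Generated code")
    name_box = gr.Textbox(label="Space name")
    sdk_box = gr.Dropdown(["static", "gradio", "streamlit"], value="gradio", label="SDK")
    status = gr.Markdown()
    gr.Button("Deploy").click(on_deploy, [code_box, name_box, sdk_box], status)

if __name__ == "__main__":
    demo.launch()
```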
web_extraction.py
DELETED
@@ -1,271 +0,0 @@
````python
import requests
from urllib.parse import urlparse, urljoin, ParseResult
from bs4 import BeautifulSoup
import re
from tavily import TavilyClient
import os

tavily_client = None
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
if TAVILY_API_KEY:
    import logging
    try:
        tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
    except Exception as e:
        print(f"Failed to initialize Tavily client: {e}")

def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
    """Perform web search using Tavily with default parameters"""
    if not tavily_client:
        return "Web search is not available. Please set the TAVILY_API_KEY environment variable."

    try:
        # Use Tavily defaults with advanced search depth for better results
        search_params = {
            "search_depth": "advanced",
            "max_results": min(max(1, max_results), 20)
        }
        if include_domains is not None:
            search_params["include_domains"] = include_domains
        if exclude_domains is not None:
            search_params["exclude_domains"] = exclude_domains

        response = tavily_client.search(query, **search_params)

        search_results = []
        for result in response.get('results', []):
            title = result.get('title', 'No title')
            url = result.get('url', 'No URL')
            content = result.get('content', 'No content')
            search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")

        if search_results:
            return "Web Search Results:\n\n" + "\n---\n".join(search_results)
        else:
            return "No search results found."

    except Exception as e:
        return f"Search error: {str(e)}"

def enhance_query_with_search(query: str, enable_search: bool) -> str:
    """Enhance the query with web search results if search is enabled"""
    if not enable_search or not tavily_client:
        return query

    # Perform search to get relevant information
    search_results = perform_web_search(query)

    # Combine original query with search results
    enhanced_query = f"""Original Query: {query}
{search_results}
Please use the search results above to help create the requested application with the most up-to-date information and best practices."""

    return enhanced_query

def extract_website_content(url: str) -> str:
    """Extract HTML code and content from a website URL"""
    try:
        # Validate URL
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = "https://" + url
            parsed_url = urlparse(url)

        if not parsed_url.netloc:
            return "Error: Invalid URL provided"

        # Set comprehensive headers to mimic a real browser request
        scheme = parsed_url.scheme
        netloc = parsed_url.netloc
        path = parsed_url.path if parsed_url.path else "/"
        params = parsed_url.params
        query = parsed_url.query
        fragment = parsed_url.fragment
        reconstructed_url = ParseResult(scheme, netloc, path, params, query, fragment).geturl()

        logging.info(f"Extracting content from: {reconstructed_url}")

        if reconstructed_url != url:
            logging.info(f"Original URL: {url}")
            logging.info(f"Reconstructed URL: {reconstructed_url}")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0'
        }

        # Create a session to maintain cookies and handle redirects
        session = requests.Session()
        session.headers.update(headers)

        # Make the request with retry logic
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = session.get(url, timeout=15, allow_redirects=True)
                response.raise_for_status()
                break  # Exit the loop if successful
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 403 and attempt < max_retries - 1:
                    # Try with different User-Agent on 403
                    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    continue
                else:
                    raise

        # Get the raw HTML content with proper encoding
        try:
            # Try to get the content with automatic encoding detection
            response.encoding = response.apparent_encoding
            raw_html = response.text
        except:
            # Fallback to UTF-8 if encoding detection fails
            raw_html = response.content.decode('utf-8', errors='ignore')

        # Debug: Check if we got valid HTML
        if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
            print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")

            # Try alternative approaches
            try:
                raw_html = response.content.decode('latin-1', errors='ignore')
            except:
                try:
                    raw_html = response.content.decode('utf-8', errors='ignore')
                except:
                    raw_html = response.content.decode('cp1252', errors='ignore')

        # Parse HTML content for analysis
        soup = BeautifulSoup(raw_html, 'html.parser')

        # Check if this is a JavaScript-heavy site
        script_tags = soup.find_all('script')
        if len(script_tags) > 10:
            print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
            # Attempt to use Playwright to render the page and get full HTML
            try:
                from playwright.sync_api import sync_playwright
                with sync_playwright() as p:
                    browser = p.chromium.launch()
                    page = browser.new_page()
                    page.goto(url, timeout=30000)
                    page.wait_for_load_state("networkidle")
                    rendered_html = page.content()
                    browser.close()
                soup = BeautifulSoup(rendered_html, 'html.parser')
            except Exception as e:
                print(f"Playwright rendering failed: {e}")

        # Extract title, meta description, etc.
        title = soup.find('title')
        title_text = title.get_text().strip() if title else "No title found"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', '') if meta_desc else ""

        # Fix image URLs
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                img['src'] = urljoin(url, src)

        # Fix background images in style attributes
        for element in soup.find_all(attrs={'style': True}):
            style_attr = element.get('style', '')
            bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
            matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
            for match in matches:
                if not match.startswith(('http', '//', 'data:')):
                    style_attr = style_attr.replace(match, urljoin(url, match))
                    element['style'] = style_attr

        # Fix background images in <style> tags
        for style in soup.find_all('style'):
            if style.string:
                style_content = style.string
                bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
                matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
                for match in matches:
                    if not match.startswith(('http', '//', 'data:')):
                        style_content = style_content.replace(match, urljoin(url, match))
                        style.string = style_content

        # Test a few image URLs to see if they're accessible
        def test_image_url(img_url):
            try:
                test_response = requests.head(img_url, timeout=5, allow_redirects=True)
                return test_response.status_code == 200
            except:
                return False

        working_images = []
        for img in soup.find_all('img')[:10]:
            if test_image_url(img['src']):
                working_images.append(img)

        modified_html = str(soup)
        cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
        cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
        cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)

        if len(cleaned_html) > 15000:
            cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"

        if len(cleaned_html.strip()) < 100:
            website_content = f"""
WEBSITE REDESIGN - EXTRACTION FAILED
====================================
URL: {url}
Title: {title_text}
ERROR: Could not extract meaningful HTML content from this website. This could be due to:
1. The website uses heavy JavaScript to load content dynamically
2. The website has anti-bot protection
3. The website requires authentication
FALLBACK APPROACH:
Please create a modern, responsive website design for a {title_text.lower()} website."""
            return website_content.strip()

        website_content = f"""
WEBSITE REDESIGN - ORIGINAL HTML CODE
=====================================
URL: {url}
Title: {title_text}
Description: {description}
IMAGES FOUND (use these exact URLs in your redesign):
{chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}
ORIGINAL HTML CODE (use this as the base for redesign):
```html
{cleaned_html}
```
REDESIGN INSTRUCTIONS:
Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""

        return website_content.strip()

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
        elif e.response.status_code == 404:
            return f"Error: Website not found (404). Please check the URL and try again."
        elif e.response.status_code >= 500:
            return f"Error: Website server error ({e.response.status_code}). Please try again later."
        else:
            return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
    except requests.exceptions.Timeout:
        return "Error: Request timed out. The website may be slow or unavailable."
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the website. Please check your internet connection and the URL."
    except requests.exceptions.RequestException as e:
        return f"Error accessing website: {str(e)}"
    except Exception as e:
        return f"Error extracting website content: {str(e)}"
````
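For context, a minimal sketch of how the two entry points of the removed module could be exercised before this rename; the example URL and prompt are placeholders, not taken from this repository.

```python
# Hypothetical driver for the removed helpers, for context only.
from web_extraction import extract_website_content, enhance_query_with_search

# Build a redesign prompt from an existing site, then optionally enrich a
# free-form request with Tavily search results (requires TAVILY_API_KEY).
redesign_prompt = extract_website_content("example.com")
enriched_query = enhance_query_with_search("Build a landing page for a bakery", enable_search=True)

print(redesign_prompt[:500])
print(enriched_query[:500])
```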