mgbam committed
Commit bf0d7be · verified · 1 Parent(s): 7833311

Rename web_extraction.py to deployment.py

Files changed (2)
  1. deployment.py +120 -0
  2. web_extraction.py +0 -271
deployment.py ADDED
@@ -0,0 +1,120 @@
+ # /deployment.py
+
+ """
+ Handles deployment of generated code to Hugging Face Spaces.
+
+ This module provides functions to wrap generated code into a runnable
+ Gradio or Static HTML app and to programmatically create and upload
+ it to a user's Hugging Face Space.
+ """
+ import tempfile
+ import webbrowser
+ import logging
+ from urllib.parse import urlencode
+
+ from huggingface_hub import HfApi, HfFolder
+ import gradio as gr
+
+ def _create_space_readme(space_name: str, sdk: str) -> str:
+     """Generates a standard README.md file for the new Space."""
+     return f"""---
+ title: {space_name}
+ emoji: 🚀
+ colorFrom: blue
+ colorTo: green
+ sdk: {sdk}
+ ---
+
+ # {space_name}
+
+ This Space was generated by [AnyCoder](<YOUR_APP_SPACE_URL>).
+ """
+
+ def deploy_to_hf_space(
+     code: str,
+     space_name: str,
+     sdk: str,
+     hf_token: str
+ ) -> str:
+     """
+     Creates or updates a Hugging Face Space and uploads the generated code.
+
+     Args:
+         code: The code to deploy (HTML or Python).
+         space_name: The desired name for the Space.
+         sdk: The SDK for the Space ('static', 'gradio', 'streamlit').
+         hf_token: The user's Hugging Face write token.
+
+     Returns:
+         A success or error message with a link to the Space.
+     """
+     if not code or not code.strip():
+         return "Cannot deploy: No code has been generated."
+     if not space_name or not space_name.strip():
+         return "Cannot deploy: Please provide a name for your app."
+     if not hf_token:
+         # Fallback to URL-based deployment if no token is provided
+         return deploy_via_url(code, space_name, sdk)
+
+     try:
+         api = HfApi(token=hf_token)
+         user_info = api.whoami(token=hf_token)
+         username = user_info['name']
+         repo_id = f"{username}/{space_name.strip()}"
+
+         api.create_repo(repo_id, repo_type="space", space_sdk=sdk, exist_ok=True)
+
+         if sdk == 'static':
+             file_content = code
+             file_path_in_repo = "index.html"
+         else:  # gradio or streamlit
+             file_content = code  # Assume code is already wrapped for Python
+             file_path_in_repo = "app.py"
+
+         # Upload the main app file
+         api.upload_file(
+             path_or_fileobj=file_content.encode('utf-8'),
+             path_in_repo=file_path_in_repo,
+             repo_id=repo_id,
+             repo_type="space"
+         )
+         # Upload a README
+         readme_content = _create_space_readme(space_name, sdk)
+         api.upload_file(
+             path_or_fileobj=readme_content.encode('utf-8'),
+             path_in_repo="README.md",
+             repo_id=repo_id,
+             repo_type="space"
+         )
+
+         space_url = f"https://huggingface.co/spaces/{repo_id}"
+         return f"✅ Deployed successfully! [Open your Space]({space_url})"
+
+     except Exception as e:
+         logging.error(f"Failed to deploy to Hugging Face Space: {e}")
+         return f"❌ Deployment failed: {str(e)}"
+
+ def deploy_via_url(code: str, space_name: str, sdk: str) -> str:
+     """
+     Opens a new browser tab with pre-filled parameters to create a Space.
+     This is a fallback for users who are not logged in via OAuth.
+     """
+     if sdk == 'static':
+         app_file = "index.html"
+         content = code
+     else:
+         app_file = "app.py"
+         # Basic wrapping for Python-based SDKs if needed
+         content = code  # Assuming code is a Python string
+
+     params = urlencode({
+         "name": space_name,
+         "sdk": sdk,
+         "files[0][path]": app_file,
+         "files[0][content]": content
+     })
+     base_url = "https://huggingface.co/new-space"
+     full_url = f"{base_url}?{params}"
+
+     webbrowser.open_new_tab(full_url)
+     return "🚀 Your app is ready to launch! Check the new browser tab."
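
For context, a minimal sketch of how the new `deploy_to_hf_space` helper might be wired into a Gradio UI. The component layout, labels, and button callback here are illustrative assumptions and are not part of this commit; only `deploy_to_hf_space` comes from the file above.

```python
# Hypothetical Gradio front end for the new deployment helper.
import gradio as gr
from deployment import deploy_to_hf_space

with gr.Blocks() as demo:
    code_box = gr.Code(label="Generated code", language="python")
    name_box = gr.Textbox(label="Space name", value="my-generated-app")
    sdk_box = gr.Dropdown(["static", "gradio", "streamlit"], value="gradio", label="SDK")
    token_box = gr.Textbox(label="HF write token", type="password")
    status = gr.Markdown()

    # deploy_to_hf_space returns a markdown status string, e.g. "✅ Deployed successfully! ..."
    gr.Button("Deploy").click(
        deploy_to_hf_space,
        inputs=[code_box, name_box, sdk_box, token_box],
        outputs=status,
    )

if __name__ == "__main__":
    demo.launch()
```

With a valid write token the helper uploads directly via `HfApi`; without one it falls back to `deploy_via_url`, which opens the pre-filled `new-space` page in a browser tab.
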
web_extraction.py DELETED
@@ -1,271 +0,0 @@
- import requests
- from urllib.parse import urlparse, urljoin, ParseResult
- from bs4 import BeautifulSoup
- import re
- from tavily import TavilyClient
- import os
-
- tavily_client = None
- TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
- if TAVILY_API_KEY:
-     import logging
-     try:
-         tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
-     except Exception as e:
-         print(f"Failed to initialize Tavily client: {e}")
-
- def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
-     """Perform web search using Tavily with default parameters"""
-     if not tavily_client:
-         return "Web search is not available. Please set the TAVILY_API_KEY environment variable."
-
-     try:
-         # Use Tavily defaults with advanced search depth for better results
-         search_params = {
-             "search_depth": "advanced",
-             "max_results": min(max(1, max_results), 20)
-         }
-         if include_domains is not None:
-             search_params["include_domains"] = include_domains
-         if exclude_domains is not None:
-             search_params["exclude_domains"] = exclude_domains
-
-         response = tavily_client.search(query, **search_params)
-
-         search_results = []
-         for result in response.get('results', []):
-             title = result.get('title', 'No title')
-             url = result.get('url', 'No URL')
-             content = result.get('content', 'No content')
-             search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")
-
-         if search_results:
-             return "Web Search Results:\n\n" + "\n---\n".join(search_results)
-         else:
-             return "No search results found."
-
-     except Exception as e:
-         return f"Search error: {str(e)}"
-
- def enhance_query_with_search(query: str, enable_search: bool) -> str:
-     """Enhance the query with web search results if search is enabled"""
-     if not enable_search or not tavily_client:
-         return query
-
-     # Perform search to get relevant information
-     search_results = perform_web_search(query)
-
-     # Combine original query with search results
-     enhanced_query = f"""Original Query: {query}
- {search_results}
- Please use the search results above to help create the requested application with the most up-to-date information and best practices."""
-
-     return enhanced_query
-
- def extract_website_content(url: str) -> str:
-     """Extract HTML code and content from a website URL"""
-     try:
-         # Validate URL
-         parsed_url = urlparse(url)
-         if not parsed_url.scheme:
-             url = "https://" + url
-             parsed_url = urlparse(url)
-
-         if not parsed_url.netloc:
-             return "Error: Invalid URL provided"
-
-         # Set comprehensive headers to mimic a real browser request
-         scheme = parsed_url.scheme
-         netloc = parsed_url.netloc
-         path = parsed_url.path if parsed_url.path else "/"
-         params = parsed_url.params
-         query = parsed_url.query
-         fragment = parsed_url.fragment
-         reconstructed_url = ParseResult(scheme, netloc, path, params, query, fragment).geturl()
-
-         logging.info(f"Extracting content from: {reconstructed_url}")
-
-         if reconstructed_url != url:
-             logging.info(f"Original URL: {url}")
-             logging.info(f"Reconstructed URL: {reconstructed_url}")
-
-
-         headers = {
-             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.9',
-             'Accept-Encoding': 'gzip, deflate, br',
-             'DNT': '1',
-             'Connection': 'keep-alive',
-             'Upgrade-Insecure-Requests': '1',
-             'Sec-Fetch-Dest': 'document',
-             'Sec-Fetch-Mode': 'navigate',
-             'Sec-Fetch-Site': 'none',
-             'Sec-Fetch-User': '?1',
-             'Cache-Control': 'max-age=0'
-         }
-
-         # Create a session to maintain cookies and handle redirects
-         session = requests.Session()
-         session.headers.update(headers)
-
-         # Make the request with retry logic
-         max_retries = 3
-         for attempt in range(max_retries):
-             try:
-                 response = session.get(url, timeout=15, allow_redirects=True)
-                 response.raise_for_status()
-                 break  # Exit the loop if successful
-             except requests.exceptions.HTTPError as e:
-                 if e.response.status_code == 403 and attempt < max_retries - 1:
-                     # Try with different User-Agent on 403
-                     session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-                     continue
-                 else:
-                     raise
-
-         # Get the raw HTML content with proper encoding
-         try:
-             # Try to get the content with automatic encoding detection
-             response.encoding = response.apparent_encoding
-             raw_html = response.text
-         except:
-             # Fallback to UTF-8 if encoding detection fails
-             raw_html = response.content.decode('utf-8', errors='ignore')
-
-         # Debug: Check if we got valid HTML
-         if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
-             print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
-
-             # Try alternative approaches
-             try:
-                 raw_html = response.content.decode('latin-1', errors='ignore')
-             except:
-                 try:
-                     raw_html = response.content.decode('utf-8', errors='ignore')
-                 except:
-                     raw_html = response.content.decode('cp1252', errors='ignore')
-
-         # Parse HTML content for analysis
-         soup = BeautifulSoup(raw_html, 'html.parser')
-
-         # Check if this is a JavaScript-heavy site
-         script_tags = soup.find_all('script')
-         if len(script_tags) > 10:
-             print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
-             # Attempt to use Playwright to render the page and get full HTML
-             try:
-                 from playwright.sync_api import sync_playwright
-                 with sync_playwright() as p:
-                     browser = p.chromium.launch()
-                     page = browser.new_page()
-                     page.goto(url, timeout=30000)
-                     page.wait_for_load_state("networkidle")
-                     rendered_html = page.content()
-                     browser.close()
-                     soup = BeautifulSoup(rendered_html, 'html.parser')
-             except Exception as e:
-                 print(f"Playwright rendering failed: {e}")
-
-         # Extract title, meta description, etc.
-         title = soup.find('title')
-         title_text = title.get_text().strip() if title else "No title found"
-         meta_desc = soup.find('meta', attrs={'name': 'description'})
-         description = meta_desc.get('content', '') if meta_desc else ""
-
-         # Fix image URLs
-         for img in soup.find_all('img'):
-             src = img.get('src', '')
-             if src:
-                 img['src'] = urljoin(url, src)
-
-         # Fix background images in style attributes
-         for element in soup.find_all(attrs={'style': True}):
-             style_attr = element.get('style', '')
-             bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
-             matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
-             for match in matches:
-                 if not match.startswith(('http', '//', 'data:')):
-                     style_attr = style_attr.replace(match, urljoin(url, match))
-             element['style'] = style_attr
-
-         # Fix background images in <style> tags
-         for style in soup.find_all('style'):
-             if style.string:
-                 style_content = style.string
-                 bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
-                 matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
-                 for match in matches:
-                     if not match.startswith(('http', '//', 'data:')):
-                         style_content = style_content.replace(match, urljoin(url, match))
-                 style.string = style_content
-
-         # Test a few image URLs to see if they're accessible
-         def test_image_url(img_url):
-             try:
-                 test_response = requests.head(img_url, timeout=5, allow_redirects=True)
-                 return test_response.status_code == 200
-             except:
-                 return False
-
-         working_images = []
-         for img in soup.find_all('img')[:10]:
-             if test_image_url(img['src']):
-                 working_images.append(img)
-
-         modified_html = str(soup)
-         cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
-         cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
-         cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
-
-         if len(cleaned_html) > 15000:
-             cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
-
-         if len(cleaned_html.strip()) < 100:
-             website_content = f"""
- WEBSITE REDESIGN - EXTRACTION FAILED
- ====================================
- URL: {url}
- Title: {title_text}
- ERROR: Could not extract meaningful HTML content from this website. This could be due to:
- 1. The website uses heavy JavaScript to load content dynamically
- 2. The website has anti-bot protection
- 3. The website requires authentication
- FALLBACK APPROACH:
- Please create a modern, responsive website design for a {title_text.lower()} website."""
-             return website_content.strip()
-
-         website_content = f"""
- WEBSITE REDESIGN - ORIGINAL HTML CODE
- =====================================
- URL: {url}
- Title: {title_text}
- Description: {description}
- IMAGES FOUND (use these exact URLs in your redesign):
- {chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}
- ORIGINAL HTML CODE (use this as the base for redesign):
- ```html
- {cleaned_html}
- ```
- REDESIGN INSTRUCTIONS:
- Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""
-
-         return website_content.strip()
-
-     except requests.exceptions.HTTPError as e:
-         if e.response.status_code == 403:
-             return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
-         elif e.response.status_code == 404:
-             return f"Error: Website not found (404). Please check the URL and try again."
-         elif e.response.status_code >= 500:
-             return f"Error: Website server error ({e.response.status_code}). Please try again later."
-         else:
-             return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
-     except requests.exceptions.Timeout:
-         return "Error: Request timed out. The website may be slow or unavailable."
-     except requests.exceptions.ConnectionError:
-         return "Error: Could not connect to the website. Please check your internet connection and the URL."
-     except requests.exceptions.RequestException as e:
-         return f"Error accessing website: {str(e)}"
-     except Exception as e:
-         return f"Error extracting website content: {str(e)}"
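
The removed module bundled Tavily web search and raw-HTML extraction. Its callers are not shown in this diff, but they presumably invoked it roughly as sketched below; the argument values are hypothetical and only the function names come from the deleted file.

```python
# Hypothetical usage of the removed helpers from web_extraction.py.
from web_extraction import extract_website_content, enhance_query_with_search

# Scrape a reference site; a missing scheme is filled in with https:// automatically.
site_context = extract_website_content("example.com")

# Optionally enrich a prompt with Tavily results (requires TAVILY_API_KEY to be set).
prompt = enhance_query_with_search("Build a landing page for a bakery", enable_search=True)
```
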