mgbam committed
Commit bf0d7be · verified · 1 Parent(s): 7833311

Rename web_extraction.py to deployment.py

Files changed (2)
  1. deployment.py +120 -0
  2. web_extraction.py +0 -271
deployment.py ADDED
@@ -0,0 +1,120 @@
+ # /deployment.py
+
+ """
+ Handles deployment of generated code to Hugging Face Spaces.
+
+ This module provides functions to wrap generated code into a runnable
+ Gradio or Static HTML app and to programmatically create and upload
+ it to a user's Hugging Face Space.
+ """
+ import tempfile
+ import webbrowser
+ import logging
+ from urllib.parse import urlencode
+
+ from huggingface_hub import HfApi, HfFolder
+ import gradio as gr
+
+ def _create_space_readme(space_name: str, sdk: str) -> str:
+     """Generates a standard README.md file for the new Space."""
+     return f"""---
+ title: {space_name}
+ emoji: 🚀
+ colorFrom: blue
+ colorTo: green
+ sdk: {sdk}
+ ---
+
+ # {space_name}
+
+ This Space was generated by [AnyCoder](<YOUR_APP_SPACE_URL>).
+ """
+
+ def deploy_to_hf_space(
+     code: str,
+     space_name: str,
+     sdk: str,
+     hf_token: str
+ ) -> str:
+     """
+     Creates or updates a Hugging Face Space and uploads the generated code.
+
+     Args:
+         code: The code to deploy (HTML or Python).
+         space_name: The desired name for the Space.
+         sdk: The SDK for the Space ('static', 'gradio', 'streamlit').
+         hf_token: The user's Hugging Face write token.
+
+     Returns:
+         A success or error message with a link to the Space.
+     """
+     if not code or not code.strip():
+         return "Cannot deploy: No code has been generated."
+     if not space_name or not space_name.strip():
+         return "Cannot deploy: Please provide a name for your app."
+     if not hf_token:
+         # Fallback to URL-based deployment if no token is provided
+         return deploy_via_url(code, space_name, sdk)
+
+     try:
+         api = HfApi(token=hf_token)
+         user_info = api.whoami(token=hf_token)
+         username = user_info['name']
+         repo_id = f"{username}/{space_name.strip()}"
+
+         api.create_repo(repo_id, repo_type="space", space_sdk=sdk, exist_ok=True)
+
+         if sdk == 'static':
+             file_content = code
+             file_path_in_repo = "index.html"
+         else:  # gradio or streamlit
+             file_content = code  # Assume code is already wrapped for Python
+             file_path_in_repo = "app.py"
+
+         # Upload the main app file
+         api.upload_file(
+             path_or_fileobj=file_content.encode('utf-8'),
+             path_in_repo=file_path_in_repo,
+             repo_id=repo_id,
+             repo_type="space"
+         )
+         # Upload a README
+         readme_content = _create_space_readme(space_name, sdk)
+         api.upload_file(
+             path_or_fileobj=readme_content.encode('utf-8'),
+             path_in_repo="README.md",
+             repo_id=repo_id,
+             repo_type="space"
+         )
+
+         space_url = f"https://huggingface.co/spaces/{repo_id}"
+         return f"✅ Deployed successfully! [Open your Space]({space_url})"
+
+     except Exception as e:
+         logging.error(f"Failed to deploy to Hugging Face Space: {e}")
+         return f"❌ Deployment failed: {str(e)}"
+
+ def deploy_via_url(code: str, space_name: str, sdk: str) -> str:
+     """
+     Opens a new browser tab with pre-filled parameters to create a Space.
+     This is a fallback for users who are not logged in via OAuth.
+     """
+     if sdk == 'static':
+         app_file = "index.html"
+         content = code
+     else:
+         app_file = "app.py"
+         # Basic wrapping for Python-based SDKs if needed
+         content = code  # Assuming code is a Python string
+
+     params = urlencode({
+         "name": space_name,
+         "sdk": sdk,
+         "files[0][path]": app_file,
+         "files[0][content]": content
+     })
+     base_url = "https://huggingface.co/new-space"
+     full_url = f"{base_url}?{params}"
+
+     webbrowser.open_new_tab(full_url)
+     return "🚀 Your app is ready to launch! Check the new browser tab."
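
For context, a minimal sketch of how the new `deploy_to_hf_space` helper might be wired into a Gradio UI. The component layout, labels, and button callback here are illustrative assumptions and are not part of this commit; only `deploy_to_hf_space` comes from the file above.

```python
# Hypothetical Gradio front end for the new deployment helper.
import gradio as gr
from deployment import deploy_to_hf_space

with gr.Blocks() as demo:
    code_box = gr.Code(label="Generated code", language="python")
    name_box = gr.Textbox(label="Space name", value="my-generated-app")
    sdk_box = gr.Dropdown(["static", "gradio", "streamlit"], value="gradio", label="SDK")
    token_box = gr.Textbox(label="HF write token", type="password")
    status = gr.Markdown()

    # deploy_to_hf_space returns a markdown status string, e.g. "✅ Deployed successfully! ..."
    gr.Button("Deploy").click(
        deploy_to_hf_space,
        inputs=[code_box, name_box, sdk_box, token_box],
        outputs=status,
    )

if __name__ == "__main__":
    demo.launch()
```

With a valid write token the helper uploads directly via `HfApi`; without one it falls back to `deploy_via_url`, which opens the pre-filled `new-space` page in a browser tab.
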
web_extraction.py DELETED
@@ -1,271 +0,0 @@
- import requests
- from urllib.parse import urlparse, urljoin, ParseResult
- from bs4 import BeautifulSoup
- import re
- from tavily import TavilyClient
- import os
-
- tavily_client = None
- TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
- if TAVILY_API_KEY:
-     import logging
-     try:
-         tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
-     except Exception as e:
-         print(f"Failed to initialize Tavily client: {e}")
-
- def perform_web_search(query: str, max_results: int = 5, include_domains=None, exclude_domains=None) -> str:
-     """Perform web search using Tavily with default parameters"""
-     if not tavily_client:
-         return "Web search is not available. Please set the TAVILY_API_KEY environment variable."
-
-     try:
-         # Use Tavily defaults with advanced search depth for better results
-         search_params = {
-             "search_depth": "advanced",
-             "max_results": min(max(1, max_results), 20)
-         }
-         if include_domains is not None:
-             search_params["include_domains"] = include_domains
-         if exclude_domains is not None:
-             search_params["exclude_domains"] = exclude_domains
-
-         response = tavily_client.search(query, **search_params)
-
-         search_results = []
-         for result in response.get('results', []):
-             title = result.get('title', 'No title')
-             url = result.get('url', 'No URL')
-             content = result.get('content', 'No content')
-             search_results.append(f"Title: {title}\nURL: {url}\nContent: {content}\n")
-
-         if search_results:
-             return "Web Search Results:\n\n" + "\n---\n".join(search_results)
-         else:
-             return "No search results found."
-
-     except Exception as e:
-         return f"Search error: {str(e)}"
-
- def enhance_query_with_search(query: str, enable_search: bool) -> str:
-     """Enhance the query with web search results if search is enabled"""
-     if not enable_search or not tavily_client:
-         return query
-
-     # Perform search to get relevant information
-     search_results = perform_web_search(query)
-
-     # Combine original query with search results
-     enhanced_query = f"""Original Query: {query}
- {search_results}
- Please use the search results above to help create the requested application with the most up-to-date information and best practices."""
-
-     return enhanced_query
-
- def extract_website_content(url: str) -> str:
-     """Extract HTML code and content from a website URL"""
-     try:
-         # Validate URL
-         parsed_url = urlparse(url)
-         if not parsed_url.scheme:
-             url = "https://" + url
-             parsed_url = urlparse(url)
-
-         if not parsed_url.netloc:
-             return "Error: Invalid URL provided"
-
-         # Set comprehensive headers to mimic a real browser request
-         scheme = parsed_url.scheme
-         netloc = parsed_url.netloc
-         path = parsed_url.path if parsed_url.path else "/"
-         params = parsed_url.params
-         query = parsed_url.query
-         fragment = parsed_url.fragment
-         reconstructed_url = ParseResult(scheme, netloc, path, params, query, fragment).geturl()
-
-         logging.info(f"Extracting content from: {reconstructed_url}")
-
-         if reconstructed_url != url:
-             logging.info(f"Original URL: {url}")
-             logging.info(f"Reconstructed URL: {reconstructed_url}")
-
-
-         headers = {
-             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.9',
-             'Accept-Encoding': 'gzip, deflate, br',
-             'DNT': '1',
-             'Connection': 'keep-alive',
-             'Upgrade-Insecure-Requests': '1',
-             'Sec-Fetch-Dest': 'document',
-             'Sec-Fetch-Mode': 'navigate',
-             'Sec-Fetch-Site': 'none',
-             'Sec-Fetch-User': '?1',
-             'Cache-Control': 'max-age=0'
-         }
-
-         # Create a session to maintain cookies and handle redirects
-         session = requests.Session()
-         session.headers.update(headers)
-
-         # Make the request with retry logic
-         max_retries = 3
-         for attempt in range(max_retries):
-             try:
-                 response = session.get(url, timeout=15, allow_redirects=True)
-                 response.raise_for_status()
-                 break  # Exit the loop if successful
-             except requests.exceptions.HTTPError as e:
-                 if e.response.status_code == 403 and attempt < max_retries - 1:
-                     # Try with different User-Agent on 403
-                     session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-                     continue
-                 else:
-                     raise
-
-         # Get the raw HTML content with proper encoding
-         try:
-             # Try to get the content with automatic encoding detection
-             response.encoding = response.apparent_encoding
-             raw_html = response.text
-         except:
-             # Fallback to UTF-8 if encoding detection fails
-             raw_html = response.content.decode('utf-8', errors='ignore')
-
-         # Debug: Check if we got valid HTML
-         if not raw_html.strip().startswith('<!DOCTYPE') and not raw_html.strip().startswith('<html'):
-             print(f"Warning: Response doesn't look like HTML. First 200 chars: {raw_html[:200]}")
-
-             # Try alternative approaches
-             try:
-                 raw_html = response.content.decode('latin-1', errors='ignore')
-             except:
-                 try:
-                     raw_html = response.content.decode('utf-8', errors='ignore')
-                 except:
-                     raw_html = response.content.decode('cp1252', errors='ignore')
-
-         # Parse HTML content for analysis
-         soup = BeautifulSoup(raw_html, 'html.parser')
-
-         # Check if this is a JavaScript-heavy site
-         script_tags = soup.find_all('script')
-         if len(script_tags) > 10:
-             print(f"Warning: This site has {len(script_tags)} script tags - it may be a JavaScript-heavy site")
-             # Attempt to use Playwright to render the page and get full HTML
-             try:
-                 from playwright.sync_api import sync_playwright
-                 with sync_playwright() as p:
-                     browser = p.chromium.launch()
-                     page = browser.new_page()
-                     page.goto(url, timeout=30000)
-                     page.wait_for_load_state("networkidle")
-                     rendered_html = page.content()
-                     browser.close()
-                     soup = BeautifulSoup(rendered_html, 'html.parser')
-             except Exception as e:
-                 print(f"Playwright rendering failed: {e}")
-
-         # Extract title, meta description, etc.
-         title = soup.find('title')
-         title_text = title.get_text().strip() if title else "No title found"
-         meta_desc = soup.find('meta', attrs={'name': 'description'})
-         description = meta_desc.get('content', '') if meta_desc else ""
-
-         # Fix image URLs
-         for img in soup.find_all('img'):
-             src = img.get('src', '')
-             if src:
-                 img['src'] = urljoin(url, src)
-
-         # Fix background images in style attributes
-         for element in soup.find_all(attrs={'style': True}):
-             style_attr = element.get('style', '')
-             bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
-             matches = re.findall(bg_pattern, style_attr, re.IGNORECASE)
-             for match in matches:
-                 if not match.startswith(('http', '//', 'data:')):
-                     style_attr = style_attr.replace(match, urljoin(url, match))
-             element['style'] = style_attr
-
-         # Fix background images in <style> tags
-         for style in soup.find_all('style'):
-             if style.string:
-                 style_content = style.string
-                 bg_pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'
-                 matches = re.findall(bg_pattern, style_content, re.IGNORECASE)
-                 for match in matches:
-                     if not match.startswith(('http', '//', 'data:')):
-                         style_content = style_content.replace(match, urljoin(url, match))
-                 style.string = style_content
-
-         # Test a few image URLs to see if they're accessible
-         def test_image_url(img_url):
-             try:
-                 test_response = requests.head(img_url, timeout=5, allow_redirects=True)
-                 return test_response.status_code == 200
-             except:
-                 return False
-
-         working_images = []
-         for img in soup.find_all('img')[:10]:
-             if test_image_url(img['src']):
-                 working_images.append(img)
-
-         modified_html = str(soup)
-         cleaned_html = re.sub(r'<!--.*?-->', '', modified_html, flags=re.DOTALL)
-         cleaned_html = re.sub(r'\s+', ' ', cleaned_html)
-         cleaned_html = re.sub(r'>\s+<', '><', cleaned_html)
-
-         if len(cleaned_html) > 15000:
-             cleaned_html = cleaned_html[:15000] + "\n<!-- ... HTML truncated for length ... -->"
-
-         if len(cleaned_html.strip()) < 100:
-             website_content = f"""
- WEBSITE REDESIGN - EXTRACTION FAILED
- ====================================
- URL: {url}
- Title: {title_text}
- ERROR: Could not extract meaningful HTML content from this website. This could be due to:
- 1. The website uses heavy JavaScript to load content dynamically
- 2. The website has anti-bot protection
- 3. The website requires authentication
- FALLBACK APPROACH:
- Please create a modern, responsive website design for a {title_text.lower()} website."""
-             return website_content.strip()
-
-         website_content = f"""
- WEBSITE REDESIGN - ORIGINAL HTML CODE
- =====================================
- URL: {url}
- Title: {title_text}
- Description: {description}
- IMAGES FOUND (use these exact URLs in your redesign):
- {chr(10).join([f"• {img.get('alt', 'Image')} - {img.get('src')}" for img in working_images]) if working_images else "No working images found"}
- ORIGINAL HTML CODE (use this as the base for redesign):
- ```html
- {cleaned_html}
- ```
- REDESIGN INSTRUCTIONS:
- Please redesign this website with a modern, responsive layout while preserving all original content and using the original images."""
-
-         return website_content.strip()
-
-     except requests.exceptions.HTTPError as e:
-         if e.response.status_code == 403:
-             return f"Error: Website blocked access (403 Forbidden). This website may have anti-bot protection. Try a different website or provide a description of what you want to build instead."
-         elif e.response.status_code == 404:
-             return f"Error: Website not found (404). Please check the URL and try again."
-         elif e.response.status_code >= 500:
-             return f"Error: Website server error ({e.response.status_code}). Please try again later."
-         else:
-             return f"Error accessing website: HTTP {e.response.status_code} - {str(e)}"
-     except requests.exceptions.Timeout:
-         return "Error: Request timed out. The website may be slow or unavailable."
-     except requests.exceptions.ConnectionError:
-         return "Error: Could not connect to the website. Please check your internet connection and the URL."
-     except requests.exceptions.RequestException as e:
-         return f"Error accessing website: {str(e)}"
-     except Exception as e:
-         return f"Error extracting website content: {str(e)}"
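
The removed module bundled Tavily web search and raw-HTML extraction. Its callers are not shown in this diff, but they presumably invoked it roughly as sketched below; the argument values are hypothetical and only the function names come from the deleted file.

```python
# Hypothetical usage of the removed helpers from web_extraction.py.
from web_extraction import extract_website_content, enhance_query_with_search

# Scrape a reference site; a missing scheme is filled in with https:// automatically.
site_context = extract_website_content("example.com")

# Optionally enrich a prompt with Tavily results (requires TAVILY_API_KEY to be set).
prompt = enhance_query_with_search("Build a landing page for a bakery", enable_search=True)
```
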