# app.py
"""Gradio front-end for a text-only, multi-tab browser backed by Playwright.

A single headless Firefox instance is launched at import time.  Each tab is a
Playwright page whose rendered HTML is reduced to plain text plus a list of
links with BeautifulSoup, and the Gradio UI renders that state.
"""
import gradio as gr
from playwright.sync_api import sync_playwright, Error as PlaywrightError
from bs4 import BeautifulSoup
import urllib.parse
import datetime
import atexit
import re

# NOTE(review): the emoji prefixes in the status strings below look
# mis-encoded (mojibake).  They are reproduced exactly as found; restore them
# from a correctly encoded copy of this file if one exists.

# Timeout (milliseconds) applied to every page navigation.
NAVIGATION_TIMEOUT_MS = 30000

# --- GLOBAL PLAYWRIGHT SETUP ---
# Launch Playwright and a browser instance once when the app starts so that
# every tab shares one browser process.
#
# NOTE(review): Playwright documents its sync API as not thread-safe, while
# Gradio normally runs event handlers on worker threads.  Confirm that calls
# made from the handlers below reach the browser on a supported thread.
p = None
try:
    p = sync_playwright().start()
    # Using Firefox can sometimes be less prone to bot detection than Chromium.
    # headless=True is essential for running on a server like Hugging Face Spaces.
    browser = p.firefox.launch(headless=True, timeout=60000)
    print("โœ… Playwright browser launched successfully.")
except Exception as e:
    print(f"โŒ Could not launch Playwright browser: {e}")
    # Do not leave the Playwright driver running if only the launch failed.
    if p is not None:
        p.stop()
    # Exit with a non-zero status so supervisors can see the failure.
    raise SystemExit(1) from e


def cleanup():
    """Close the shared browser and stop Playwright at interpreter exit."""
    print("๐Ÿงน Cleaning up: Closing Playwright browser...")
    try:
        browser.close()
    finally:
        # Stop the driver even if closing the browser raised.
        p.stop()


atexit.register(cleanup)


# --- Core Browser Logic (Powered by Playwright) ---
class Tab:
    """Represents a single browser tab, backed by a Playwright Page."""

    def __init__(self, playwright_page):
        self.page = playwright_page  # The actual Playwright page object
        self.title = "New Tab"
        self.url = "about:blank"
        self.parsed_text = "Welcome! Navigate to a URL or search to get started."
        self.links = []  # A list of {'text': str, 'url': str}

    def close(self):
        """Closes the underlying Playwright page."""
        if not self.page.is_closed():
            self.page.close()


class RealBrowser:
    """Manages multiple tabs and browser-level state."""

    def __init__(self):
        self.tabs = []
        self.active_tab_index = -1
        self.bookmarks = set()
        self.global_history = []  # List of (datetime, url) tuples, oldest first
        self.new_tab()  # Start with one tab

    def _get_active_tab(self):
        """Return the active Tab, or None when the index points at no tab."""
        if self.active_tab_index == -1 or self.active_tab_index >= len(self.tabs):
            return None
        return self.tabs[self.active_tab_index]

    @staticmethod
    def _describe_error(error):
        """Turn a Playwright error into a short, user-facing message."""
        message = str(error)
        # "net::ERR_*" is Chromium's wording, "NS_ERROR_*" is Firefox's.
        if "net::ERR" in message or "NS_ERROR" in message:
            return "Network error: Could not resolve host or connect."
        if "Timeout" in message:
            return "Timeout: The page took too long to load."
        return message

    def _record_failure(self, tab, url, error):
        """Put `tab` into its error state and return the log fragment for it."""
        error_message = self._describe_error(error)
        tab.title = "Error"
        tab.url = url
        tab.parsed_text = f"โŒ Failed to load page.\n\nError: {error_message}"
        tab.links = []
        return f"\nโŒ {error_message}"

    def _snapshot_page(self, tab):
        """Refresh the tab's url/title/text/links from the page as loaded.

        Does not navigate.  Returns the log fragment describing what was found.
        May raise PlaywrightError if the page cannot be queried.
        """
        # Use the final URL, i.e. after any redirects.
        tab.url = tab.page.url
        tab.title = tab.page.title() or "No Title"
        log = f"\nโœ… Arrived at: {tab.url}"
        log += f"\n๐Ÿ“„ Title: {tab.title}"

        # Get the fully-rendered HTML and parse it
        soup = BeautifulSoup(tab.page.content(), 'lxml')

        # Drop non-content elements before extracting text
        for element in soup(["script", "style", "nav", "footer"]):
            element.extract()
        tab.parsed_text = soup.get_text(separator='\n', strip=True)

        # Collect links, keeping only absolute http(s) targets
        tab.links = []
        for link in soup.find_all('a', href=True):
            absolute_url = urllib.parse.urljoin(tab.url, link['href'])
            if absolute_url.startswith(('http://', 'https://')):
                link_text = link.get_text(strip=True) or "[No Link Text]"
                tab.links.append({'text': link_text, 'url': absolute_url})
        log += f"\n๐Ÿ”— Found {len(tab.links)} links."
        return log

    def _fetch_and_parse(self, tab, url):
        """Navigate `tab` to `url`, parse the result and return a status log."""
        log = f"โ–ถ๏ธ Navigating to {url}..."
        try:
            # wait_until='domcontentloaded' is a good balance of speed and completeness.
            tab.page.goto(url, wait_until='domcontentloaded',
                          timeout=NAVIGATION_TIMEOUT_MS)
            log += self._snapshot_page(tab)
        except PlaywrightError as e:
            log += self._record_failure(tab, url, e)
        return log

    def _history_step(self, tab, step, start_message, stuck_message):
        """Run a history navigation (`go_back`/`go_forward`) and re-parse.

        `step` is the bound Playwright method to call.  When it yields no
        response and the URL did not change, there was nowhere to go and
        `stuck_message` is returned.  The page is parsed in place rather than
        re-fetched, so the history stack is left untouched.
        """
        previous_url = tab.page.url
        try:
            response = step(wait_until='domcontentloaded',
                            timeout=NAVIGATION_TIMEOUT_MS)
            if response is None and tab.page.url == previous_url:
                return stuck_message
            return start_message + self._snapshot_page(tab)
        except PlaywrightError as e:
            return start_message + self._record_failure(tab, tab.page.url, e)

    def go(self, term_or_url):
        """Opens a URL or performs a search in the active tab."""
        tab = self._get_active_tab()
        if not tab:
            return "No active tab."

        query = (term_or_url or "").strip()
        if not query:
            return "Please enter a URL or search term."

        # Only absolute http(s) URLs are navigated directly; everything else
        # is treated as a search term.
        # NOTE(review): the server-side browser will fetch any host the user
        # names, including internal addresses; restrict this if the app is
        # exposed publicly.
        parsed_url = urllib.parse.urlparse(query)
        if parsed_url.scheme in ("http", "https") and parsed_url.netloc:
            url = query
        else:
            url = f"https://duckduckgo.com/html/?q={urllib.parse.quote_plus(query)}"

        self.global_history.append((datetime.datetime.now(), url))
        return self._fetch_and_parse(tab, url)

    def back(self):
        """Go one step back in the active tab's history."""
        tab = self._get_active_tab()
        if not tab:
            return "Cannot go back."
        return self._history_step(tab, tab.page.go_back,
                                  "Going back...", "Cannot go back.")

    def forward(self):
        """Go one step forward in the active tab's history."""
        tab = self._get_active_tab()
        if not tab:
            return "Cannot go forward."
        return self._history_step(tab, tab.page.go_forward,
                                  "Going forward...", "Cannot go forward.")

    def refresh(self):
        """Reload the active tab and re-parse it."""
        tab = self._get_active_tab()
        if not tab:
            return "No active tab."
        log = "Reloading..."
        try:
            tab.page.reload(wait_until='domcontentloaded',
                            timeout=NAVIGATION_TIMEOUT_MS)
            log += self._snapshot_page(tab)
        except PlaywrightError as e:
            log += self._record_failure(tab, tab.page.url, e)
        return log

    def new_tab(self):
        """Open a new tab, make it active and load the default search page."""
        # Create a new page on the shared browser instance
        page = browser.new_page()
        self.tabs.append(Tab(page))
        self.active_tab_index = len(self.tabs) - 1
        # Navigate new tab to a default search
        return self.go("https://duckduckgo.com/html/?q=news")

    def close_tab(self):
        """Close the active tab; the last remaining tab cannot be closed."""
        if len(self.tabs) <= 1:
            return "Cannot close the last tab."

        tab_to_close = self.tabs.pop(self.active_tab_index)
        tab_to_close.close()

        # The tab after the closed one slides into its slot; clamp when the
        # closed tab was the last one.
        if self.active_tab_index >= len(self.tabs):
            self.active_tab_index = len(self.tabs) - 1

        # No need to re-fetch, just update the UI state
        return f"Tab closed. Switched to Tab {self.active_tab_index}."

    def switch_tab(self, tab_label):
        """Activate the tab named by a selector label such as "Tab 2: Title..."."""
        try:
            index = int(tab_label.split(":")[0].replace("Tab", "").strip())
        except (AttributeError, ValueError, IndexError):
            # AttributeError covers a None label (nothing selected).
            return "Invalid tab format."
        if 0 <= index < len(self.tabs):
            self.active_tab_index = index
            return f"Switched to Tab {index}."
        return "Invalid tab index."


# --- Gradio UI and Event Handlers ---
def update_ui_components(browser_state: RealBrowser):
    """Generates all UI component values from the browser state.

    Returns a dict keyed by the Gradio components defined in the layout
    below (page_content, url_textbox, links_display, tab_selector).
    """
    active_tab = browser_state._get_active_tab()
    if not active_tab:
        return {
            page_content: gr.Markdown("No active tabs. Please create a new one."),
            url_textbox: "",
            links_display: "",
            tab_selector: gr.Radio(choices=[], label="Active Tabs"),
        }

    # Tab Selector: labels start with "Tab <index>:", which switch_tab parses.
    tab_choices = [f"Tab {i}: {tab.title[:40]}..." for i, tab in enumerate(browser_state.tabs)]
    active_tab_label = f"Tab {browser_state.active_tab_index}: {active_tab.title[:40]}..."

    # Links Display: the number shown is the index accepted by "Click Link".
    links_md = "### ๐Ÿ”— Links on Page\n"
    if active_tab.links:
        links_md += "".join(
            f"{i}. [{link['text'][:80]}]({link['url']})\n"
            for i, link in enumerate(active_tab.links[:25])  # Show first 25 links
        )
    else:
        links_md += "_No links found or page failed to load._"

    return {
        page_content: gr.Markdown(
            f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text[:2000]}..."
        ),
        url_textbox: gr.Textbox(value=active_tab.url),
        links_display: gr.Markdown(links_md),
        tab_selector: gr.Radio(choices=tab_choices, value=active_tab_label, label="Active Tabs"),
    }


# --- Event Handlers ---
def _click_link(browser_state, value):
    """Follow the link numbered `value` on the active tab; return a status log."""
    tab = browser_state._get_active_tab()
    try:
        link_index = int(value)
    except (ValueError, TypeError, OverflowError):
        # None, non-numeric text, NaN and infinity all land here.
        return "Please enter a valid number to click."
    if tab and 0 <= link_index < len(tab.links):
        return browser_state.go(tab.links[link_index]['url'])
    return "Invalid link number."


def handle_action(browser_state, action, value=None):
    """Apply one UI action to the browser and return refreshed UI values.

    `action` is one of "go", "click", "back", "forward", "refresh",
    "new_tab", "close_tab" or "switch_tab"; `value` carries the argument for
    the actions that need one.  Returns a dict keyed by output component,
    including the status log.
    """
    argless_actions = {
        "back": browser_state.back,
        "forward": browser_state.forward,
        "refresh": browser_state.refresh,
        "new_tab": browser_state.new_tab,
        "close_tab": browser_state.close_tab,
    }

    if action == "go":
        log = browser_state.go(value)
    elif action == "click":
        log = _click_link(browser_state, value)
    elif action == "switch_tab":
        log = browser_state.switch_tab(value)
    elif action in argless_actions:
        log = argless_actions[action]()
    else:
        log = "Unknown action."

    # After any action, update the entire UI based on the new state
    return {
        **update_ui_components(browser_state),
        log_display: gr.Textbox(log),
    }


# --- Gradio Interface Layout ---
with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
    # The gr.State holds our Python class instance, persisting it across calls.
    # NOTE(review): Gradio documents that the initial gr.State value is
    # deep-copied per session; confirm that a RealBrowser holding live
    # Playwright pages survives that, or that sharing one instance is intended.
    browser_state = gr.State(RealBrowser())

    gr.Markdown("# ๐ŸŒ Real Browser Demo (Powered by Playwright)")
    gr.Markdown("Type a URL or search term. This demo runs a real headless browser on the server to fetch and parse live websites.")

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                back_btn = gr.Button("โ—€ Back")
                forward_btn = gr.Button("โ–ถ Forward")
                refresh_btn = gr.Button("๐Ÿ”„ Refresh")

            url_textbox = gr.Textbox(label="URL or Search Term", placeholder="https://news.ycombinator.com or 'best python libraries'", interactive=True)
            go_btn = gr.Button("Go", variant="primary")

            with gr.Accordion("Page Content (Text Only)", open=True):
                page_content = gr.Markdown("Loading...")

            log_display = gr.Textbox(label="Status Log", interactive=False)

        with gr.Column(scale=1):
            with gr.Row():
                new_tab_btn = gr.Button("โž• New Tab")
                close_tab_btn = gr.Button("โŒ Close Tab")

            tab_selector = gr.Radio(choices=[], label="Active Tabs", interactive=True)

            with gr.Accordion("Clickable Links", open=True):
                links_display = gr.Markdown("...")
                with gr.Row():
                    click_num_box = gr.Number(label="Link #", scale=1, minimum=0, step=1)
                    click_btn = gr.Button("Click Link", scale=2)

    # --- Component Wiring ---
    all_outputs = [page_content, url_textbox, links_display, tab_selector, log_display]

    # Initial load
    demo.load(
        lambda state: {**update_ui_components(state), log_display: "๐Ÿš€ Browser Initialized! Ready to navigate."},
        inputs=[browser_state],
        outputs=all_outputs
    )

    # Event listeners
    go_btn.click(lambda s, v: handle_action(s, "go", v), [browser_state, url_textbox], all_outputs, show_progress="full")
    url_textbox.submit(lambda s, v: handle_action(s, "go", v), [browser_state, url_textbox], all_outputs, show_progress="full")
    click_btn.click(lambda s, v: handle_action(s, "click", v), [browser_state, click_num_box], all_outputs, show_progress="full")

    back_btn.click(lambda s: handle_action(s, "back"), [browser_state], all_outputs, show_progress="full")
    forward_btn.click(lambda s: handle_action(s, "forward"), [browser_state], all_outputs, show_progress="full")
    refresh_btn.click(lambda s: handle_action(s, "refresh"), [browser_state], all_outputs, show_progress="full")

    new_tab_btn.click(lambda s: handle_action(s, "new_tab"), [browser_state], all_outputs, show_progress="full")
    close_tab_btn.click(lambda s: handle_action(s, "close_tab"), [browser_state], all_outputs)
    tab_selector.input(lambda s, v: handle_action(s, "switch_tab", v), [browser_state, tab_selector], all_outputs)

demo.launch()