"""Streamlit front end for the Universal Web Scraper.

Renders a sidebar with model, URL, field and pagination settings and, when
the "Scrape" button is pressed, runs a scrape and writes the result to the
page.
"""

import os

import google.generativeai as genai
import streamlit as st

from assets import PRICING
from pagination_detector import detect_pagination_elements
from scraper import fetch_html_selenium, format_data, save_formatted_data, save_raw_data

# Access API keys from Hugging Face Secrets (exposed as environment variables).
openai_api_key = os.getenv('OPENAI_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')

# Initialize Streamlit app. The page config is set before any other Streamlit
# call (including the missing-key error below), because Streamlit expects
# set_page_config to be the first Streamlit command of the script.
st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑")
st.title("Universal Web Scraper 🦑")

# Check if the keys are available. This only reports the problem; the rest of
# the app is still rendered.
if not (openai_api_key and google_api_key and groq_api_key):
    st.error("API keys are missing! Please add them as secrets in Hugging Face.")

# Initialize session state variables if they don't exist.
if 'results' not in st.session_state:
    st.session_state['results'] = None
if 'perform_scrape' not in st.session_state:
    st.session_state['perform_scrape'] = False

# Sidebar components.
st.sidebar.title("Web Scraper Settings")
model_selection = st.sidebar.selectbox(
    "Select Model", options=list(PRICING.keys()), index=0
)
# NOTE(review): the label advertises several whitespace-separated URLs, but
# the value is handed to the scraper as one string -- confirm whether
# multi-URL input is meant to be supported here.
url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace").strip()

# Toggle that shows/hides the tags field.
show_tags = st.sidebar.checkbox("Enable Scraping", value=False)

# Conditionally show the tags input based on the toggle.
tags = []
if show_tags:
    raw_tags = st.sidebar.text_input("Enter Fields to Extract (comma-separated)")
    # Trim each field name and drop blanks so that an empty input yields []
    # rather than [""].
    tags = [tag.strip() for tag in raw_tags.split(",") if tag.strip()]

st.sidebar.markdown("---")

# Pagination toggle and its optional free-text details.
use_pagination = st.sidebar.checkbox("Enable Pagination", value=False)
pagination_details = None
if use_pagination:
    pagination_details = st.sidebar.text_input(
        "Enter Pagination Details (optional)",
        help="Describe how to navigate through pages",
    )

st.sidebar.markdown("---")


def perform_scrape():
    """Scrape ``url_input`` and return the data to display.

    Reads the module-level sidebar values (``url_input``, ``use_pagination``,
    ``pagination_details``, ``model_selection``). The page is fetched with
    ``fetch_html_selenium``, passed through ``format_data`` and handed to
    ``save_raw_data`` together with the string ``"scraped_data"``.

    Returns:
        The first element of the 3-tuple returned by
        ``detect_pagination_elements`` when pagination is enabled, otherwise
        the output of ``format_data``.
    """
    raw_html = fetch_html_selenium(url_input)
    markdown = format_data(raw_html)
    save_raw_data(markdown, "scraped_data")

    if not use_pagination:
        return markdown

    # Only the first element of the result is used by this app.
    pagination_data, _, _ = detect_pagination_elements(
        url_input, pagination_details, model_selection, markdown
    )
    return pagination_data


if st.sidebar.button("Scrape"):
    if not url_input:
        # Avoid launching the scraper with an empty URL.
        st.warning("Please enter a URL to scrape.")
    else:
        with st.spinner("Scraping data..."):
            result = perform_scrape()
        st.write(result)