# webscarper / app.py
# Hugging Face Space application file (originally uploaded as "app (7).py"
# and renamed to app.py; ~2.32 kB).
import streamlit as st
import os
from scraper import fetch_html_selenium, format_data, save_raw_data, save_formatted_data
from pagination_detector import detect_pagination_elements
from assets import PRICING
import google.generativeai as genai
# Streamlit requires st.set_page_config() to be the FIRST Streamlit command
# executed in the script; the original called st.error() before it, which
# raises StreamlitAPIException. Configure the page before any other UI call.
st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑")
st.title("Universal Web Scraper 🦑")

# Access API keys from Hugging Face Secrets (exposed as environment variables).
openai_api_key = os.getenv('OPENAI_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')

# Surface a visible error when any provider key is missing; the app keeps
# rendering so the user can read the message and add the secrets.
if not openai_api_key or not google_api_key or not groq_api_key:
    st.error("API keys are missing! Please add them as secrets in Hugging Face.")

# NOTE(review): google.generativeai is imported but genai.configure(...) is
# never called in this file — presumably handled inside scraper /
# pagination_detector; confirm against those modules.

# Initialize session state variables if they don't exist, so values persist
# across Streamlit reruns.
if 'results' not in st.session_state:
    st.session_state['results'] = None
if 'perform_scrape' not in st.session_state:
    st.session_state['perform_scrape'] = False
# ---------------------------------------------------------------------------
# Sidebar controls: model choice, target URL(s), field list, pagination.
# ---------------------------------------------------------------------------
st.sidebar.title("Web Scraper Settings")
model_selection = st.sidebar.selectbox("Select Model", options=list(PRICING.keys()), index=0)
url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace")

# Toggle that reveals the comma-separated extraction-fields input.
show_tags = st.sidebar.checkbox("Enable Scraping", value=False)

# Parse the fields list only when enabled. Strip surrounding whitespace and
# drop empty entries so "price, name," yields ["price", "name"] instead of
# ["price", " name", ""] (the original split(",") kept both artifacts).
tags = []
if show_tags:
    raw_fields = st.sidebar.text_input("Enter Fields to Extract (comma-separated)")
    tags = [field.strip() for field in raw_fields.split(",") if field.strip()]

st.sidebar.markdown("---")

# Optional pagination: a free-text hint describing how to move between pages,
# consumed later by detect_pagination_elements().
use_pagination = st.sidebar.checkbox("Enable Pagination", value=False)
pagination_details = None
if use_pagination:
    pagination_details = st.sidebar.text_input("Enter Pagination Details (optional)", help="Describe how to navigate through pages")

st.sidebar.markdown("---")
# Define the scraping function
def perform_scrape():
    """Fetch the configured URL, convert it to markdown, persist it, and
    return either the markdown or the detected pagination data.

    Reads module-level widget state: ``url_input``, ``use_pagination``,
    ``pagination_details``, ``model_selection``.
    """
    page_html = fetch_html_selenium(url_input)
    page_markdown = format_data(page_html)
    save_raw_data(page_markdown, "scraped_data")

    # Without pagination the formatted markdown is the final result.
    if not use_pagination:
        return page_markdown

    # With pagination enabled, return only the detected pagination elements;
    # the detector's two other outputs are intentionally discarded.
    pagination_data, _, _ = detect_pagination_elements(
        url_input, pagination_details, model_selection, page_markdown
    )
    return pagination_data
# Run the scrape on demand and persist the result in session state so it
# survives Streamlit reruns — the 'results' key is initialized earlier in the
# script but was never written to; storing here makes the session-state
# bookkeeping consistent.
if st.sidebar.button("Scrape"):
    with st.spinner("Scraping data..."):
        st.session_state['results'] = perform_scrape()
    st.write(st.session_state['results'])