import streamlit as st
import os
from scraper import fetch_html_selenium, format_data, save_raw_data, save_formatted_data
from pagination_detector import detect_pagination_elements
from assets import PRICING
import google.generativeai as genai

# Access API keys from Hugging Face Secrets
openai_api_key = os.getenv('OPENAI_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')

# Check that all keys are available; halt early if any is missing
if not openai_api_key or not google_api_key or not groq_api_key:
    st.error("API keys are missing! Please add them as secrets in Hugging Face.")
    st.stop()

# Initialize Streamlit app
st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑")
st.title("Universal Web Scraper 🦑")

# Initialize session state variables if they don't exist
if 'results' not in st.session_state:
    st.session_state['results'] = None
if 'perform_scrape' not in st.session_state:
    st.session_state['perform_scrape'] = False

# Sidebar components
st.sidebar.title("Web Scraper Settings")
model_selection = st.sidebar.selectbox("Select Model", options=list(PRICING.keys()), index=0)
url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace")

# Toggle to show/hide the fields-to-extract input
show_tags = st.sidebar.checkbox("Enable Field Extraction", value=False)

# Conditionally show the tags input based on the toggle
tags = []
if show_tags:
    raw_tags = st.sidebar.text_input("Enter Fields to Extract (comma-separated)")
    # Strip whitespace and drop empty entries so "name, price," parses cleanly
    tags = [tag.strip() for tag in raw_tags.split(",") if tag.strip()]
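# NOTE: tags are collected here but not passed into perform_scrape() below;
# format_data in scraper.py is assumed to handle field selection itself.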

st.sidebar.markdown("---")
# Add pagination toggle and input
use_pagination = st.sidebar.checkbox("Enable Pagination", value=False)
pagination_details = None
if use_pagination:
    pagination_details = st.sidebar.text_input("Enter Pagination Details (optional)", help="Describe how to navigate through pages")

st.sidebar.markdown("---")

# Define the scraping function
def perform_scrape():
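    # NOTE: the raw input string is passed straight through; fetch_html_selenium
    # is assumed to expect a single URL, so multiple URLs are not split here.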
    raw_html = fetch_html_selenium(url_input)
    markdown = format_data(raw_html)
    save_raw_data(markdown, "scraped_data")
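    # save_formatted_data is imported but unused; if its signature mirrors
    # save_raw_data, persisting the structured output might look like
    # (hypothetical call, left commented out):
    # save_formatted_data(markdown, "scraped_data")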

    if use_pagination:
        pagination_data, _, _ = detect_pagination_elements(url_input, pagination_details, model_selection, markdown)
        return pagination_data

    return markdown

if st.sidebar.button("Scrape"):
    with st.spinner("Scraping data..."):
        # Persist results in session state so they survive Streamlit reruns
        st.session_state['results'] = perform_scrape()
        st.session_state['perform_scrape'] = True
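
# Re-display the most recent results on every rerun; this is what the
# session-state initialization near the top of the script is for.
if st.session_state['results'] is not None:
    st.write(st.session_state['results'])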