Spaces:

mobenta
/

webscarper

Running

App Files Files Community

mobenta committed on Oct 2, 2024

Commit

7bf52b6

verified ·

1 Parent(s): 12863fc

Upload app (7).py

Browse files

Files changed (1) hide show

app (7).py +65 -0

app (7).py ADDED Viewed

	@@ -0,0 +1,65 @@

+import streamlit as st
+import os
+from scraper import fetch_html_selenium, format_data, save_raw_data, save_formatted_data
+from pagination_detector import detect_pagination_elements
+from assets import PRICING
+import google.generativeai as genai
+# Access API keys from Hugging Face Secrets
+openai_api_key = os.getenv('OPENAI_API_KEY')
+google_api_key = os.getenv('GOOGLE_API_KEY')
+groq_api_key = os.getenv('GROQ_API_KEY')
+# Check if the keys are available
+if not openai_api_key or not google_api_key or not groq_api_key:
+    st.error("API keys are missing! Please add them as secrets in Hugging Face.")
+# Initialize Streamlit app
+st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑")
+st.title("Universal Web Scraper 🦑")
+# Initialize session state variables if they don't exist
+if 'results' not in st.session_state:
+    st.session_state['results'] = None
+if 'perform_scrape' not in st.session_state:
+    st.session_state['perform_scrape'] = False
+# Sidebar components
+st.sidebar.title("Web Scraper Settings")
+model_selection = st.sidebar.selectbox("Select Model", options=list(PRICING.keys()), index=0)
+url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace")
+# Add toggle to show/hide tags field
+show_tags = st.sidebar.checkbox("Enable Scraping", value=False)
+# Conditionally show tags input based on the toggle
+tags = []
+if show_tags:
+    tags = st.sidebar.text_input("Enter Fields to Extract (comma-separated)").split(",")
+st.sidebar.markdown("---")
+# Add pagination toggle and input
+use_pagination = st.sidebar.checkbox("Enable Pagination", value=False)
+pagination_details = None
+if use_pagination:
+    pagination_details = st.sidebar.text_input("Enter Pagination Details (optional)", help="Describe how to navigate through pages")
+st.sidebar.markdown("---")
+# Define the scraping function
+def perform_scrape():
+    raw_html = fetch_html_selenium(url_input)
+    markdown = format_data(raw_html)
+    save_raw_data(markdown, "scraped_data")
+    if use_pagination:
+        pagination_data, _, _ = detect_pagination_elements(url_input, pagination_details, model_selection, markdown)
+        return pagination_data
+    return markdown
+if st.sidebar.button("Scrape"):
+    with st.spinner("Scraping data..."):
+        result = perform_scrape()
+        st.write(result)