mobenta committed · verified
Commit 7bf52b6 · 1 Parent(s): 12863fc

Upload app (7).py

Files changed (1)
  1. app (7).py +65 -0
app (7).py ADDED
@@ -0,0 +1,65 @@
+
+ import streamlit as st
+ import os
+ from scraper import fetch_html_selenium, format_data, save_raw_data, save_formatted_data
+ from pagination_detector import detect_pagination_elements
+ from assets import PRICING
+ import google.generativeai as genai
+
+ # Access API keys from Hugging Face Secrets
+ openai_api_key = os.getenv('OPENAI_API_KEY')
+ google_api_key = os.getenv('GOOGLE_API_KEY')
+ groq_api_key = os.getenv('GROQ_API_KEY')
+
+ # Check if the keys are available
+ if not openai_api_key or not google_api_key or not groq_api_key:
+     st.error("API keys are missing! Please add them as secrets in Hugging Face.")
+
+ # Initialize Streamlit app
+ st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑")
+ st.title("Universal Web Scraper 🦑")
+
+ # Initialize session state variables if they don't exist
+ if 'results' not in st.session_state:
+     st.session_state['results'] = None
+ if 'perform_scrape' not in st.session_state:
+     st.session_state['perform_scrape'] = False
+
+ # Sidebar components
+ st.sidebar.title("Web Scraper Settings")
+ model_selection = st.sidebar.selectbox("Select Model", options=list(PRICING.keys()), index=0)
+ url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace")
+
+ # Add toggle to show/hide tags field
+ show_tags = st.sidebar.checkbox("Enable Scraping", value=False)
+
+ # Conditionally show tags input based on the toggle
+ tags = []
+ if show_tags:
+     tags = st.sidebar.text_input("Enter Fields to Extract (comma-separated)").split(",")
+
+ st.sidebar.markdown("---")
+ # Add pagination toggle and input
+ use_pagination = st.sidebar.checkbox("Enable Pagination", value=False)
+ pagination_details = None
+ if use_pagination:
+     pagination_details = st.sidebar.text_input("Enter Pagination Details (optional)", help="Describe how to navigate through pages")
+
+ st.sidebar.markdown("---")
+
+ # Define the scraping function
+ def perform_scrape():
+     raw_html = fetch_html_selenium(url_input)
+     markdown = format_data(raw_html)
+     save_raw_data(markdown, "scraped_data")
+
+     if use_pagination:
+         pagination_data, _, _ = detect_pagination_elements(url_input, pagination_details, model_selection, markdown)
+         return pagination_data
+
+     return markdown
+
+ if st.sidebar.button("Scrape"):
+     with st.spinner("Scraping data..."):
+         result = perform_scrape()
+         st.write(result)
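
Note: the app imports `scraper`, `pagination_detector`, and `assets`, which are not part of this commit. The stubs below are a minimal sketch of the interfaces the code above relies on, with every signature and return shape inferred from the call sites; they are assumptions for illustration, not the repository's actual implementations.

# sketch_interfaces.py - hypothetical stand-ins for the imports used by app (7).py.
# Signatures are inferred from how the app calls them; the real scraper.py,
# pagination_detector.py, and assets.py in this repo may differ.

from typing import Any, Tuple

# assets.PRICING: the app only needs the keys as model names for the selectbox.
PRICING: dict = {
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},  # example entry (assumed)
}

def fetch_html_selenium(url: str) -> str:
    """Assumed: load `url` in a headless Selenium browser and return the raw HTML."""
    from selenium import webdriver
    opts = webdriver.ChromeOptions()
    opts.add_argument("--headless=new")
    driver = webdriver.Chrome(options=opts)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()

def format_data(raw_html: str) -> str:
    """Assumed: strip markup and return a markdown/plain-text version of the page."""
    import re
    return re.sub(r"<[^>]+>", " ", raw_html)

def save_raw_data(markdown: str, filename: str) -> None:
    """Assumed: persist the formatted page text to disk."""
    with open(f"{filename}.md", "w", encoding="utf-8") as f:
        f.write(markdown)

def detect_pagination_elements(url: str, hints, model: str, markdown: str) -> Tuple[Any, int, int]:
    """Assumed: ask the selected model to find pagination links. The app unpacks
    three values and keeps only the first, so the trailing pair is treated here
    as token counts."""
    return {"page_urls": []}, 0, 0

Only these shapes are needed for the Streamlit UI above to run; the real modules presumably do the LLM calls for which the API keys are read at the top of the app.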