# webscarper / app.py
# mobenta's picture
# Update app.py
# 84cec89 verified
# raw
# history blame
# 3.79 kB
import json
import logging
from urllib.parse import quote_plus

import gradio as gr
import requests
from bs4 import BeautifulSoup
# Root logger setup: DEBUG threshold, each record prefixed with timestamp and level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
def _extract_initial_data(script_text):
    """Return the decoded ``ytInitialData`` object found in *script_text*.

    Returns ``None`` when the ``var ytInitialData = `` marker is absent.
    Raises ``json.JSONDecodeError`` when the text after the marker does not
    start with a valid JSON value.
    """
    _, marker, tail = script_text.partition('var ytInitialData = ')
    if not marker:
        return None
    # raw_decode parses exactly one JSON value and ignores whatever JavaScript
    # follows it. Cutting the text at the first "};" instead would truncate
    # the payload whenever that sequence occurs inside a string value.
    data, _ = json.JSONDecoder().raw_decode(tail.lstrip())
    return data


def _collect_videos(data, max_results):
    """Collect at most *max_results* ``(thumbnail_url, title, watch_url)`` tuples from *data*."""
    videos = []
    if max_results <= 0:
        return videos
    sections = data['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']
    for section in sections:
        for entry in section.get('itemSectionRenderer', {}).get('contents', []):
            renderer = entry.get('videoRenderer')
            if not renderer:
                continue
            video_id = renderer.get('videoId', 'N/A')
            # "or [{}]" also covers a key that is present but holds an empty
            # list, which would otherwise raise IndexError on [0].
            title_runs = renderer.get('title', {}).get('runs') or [{}]
            thumbnails = renderer.get('thumbnail', {}).get('thumbnails') or [{}]
            videos.append((
                thumbnails[0].get('url', 'N/A'),
                title_runs[0].get('text', 'N/A'),
                f"https://www.youtube.com/watch?v={video_id}",
            ))
            # Return (not break) so the cap holds across all sections, not
            # just within the current one.
            if len(videos) >= max_results:
                return videos
    return videos


def youtube_search(query, max_results=10, timeout=10):
    """Search YouTube by scraping the results page (no API key involved).

    Args:
        query: Search text; it is URL-encoded before being placed in the URL.
        max_results: Maximum number of videos to return.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(video_items, error_message)`` tuple. ``video_items`` is a list of
        ``(thumbnail_url, title, watch_url)`` tuples and ``error_message`` is
        ``""`` on success. On any failure ``video_items`` is ``[]`` and
        ``error_message`` describes the problem; exceptions are logged and
        reported through the message rather than raised.
    """
    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&', '#' and '%' that would otherwise corrupt the query string.
    search_url = f"https://www.youtube.com/results?search_query={quote_plus(query)}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    }
    logging.debug(f"Starting YouTube search for query: {query}")
    try:
        # Without a timeout a stalled connection would block the caller forever;
        # a timeout surfaces as a RequestException handled below.
        response = requests.get(search_url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.text, "html.parser")
        # The search results are embedded as a JSON blob assigned to
        # ytInitialData inside one of the page's <script> tags.
        for script in soup.find_all("script"):
            data = _extract_initial_data(script.text)
            if data is None:
                continue
            video_items = _collect_videos(data, max_results)
            if video_items:
                return video_items, ""  # Videos found, no error message
            logging.warning("No video entries found.")
            return [], "No video entries found."
        logging.warning("JSON data block not found in the page.")
        return [], "Unable to find video data."
    except requests.exceptions.RequestException as e:
        error_message = f"Request error occurred: {e}"
        logging.error(error_message)
        return [], error_message
    except json.JSONDecodeError as e:
        error_message = f"JSON decoding error occurred: {e}"
        logging.error(error_message)
        return [], error_message
    except Exception as e:
        # Top-level boundary: covers unexpected page structure (e.g. KeyError
        # while walking the JSON) so the UI gets a message instead of a crash.
        error_message = f"An unexpected error occurred: {e}"
        logging.error(error_message)
        return [], error_message
def display_videos(query):
    """Search YouTube for *query* and shape the outcome for the Gradio UI.

    Returns a 2-tuple. When the search reports an error, the first element is
    a ``gr.update`` that empties the value and sets an error label, and the
    second is the error message. Otherwise the first element is a list of
    ``(image_html, link_html)`` string pairs and the second is ``""``.
    """
    results, error_message = youtube_search(query)

    # Failure path first: clear the output and pass the message through.
    if error_message:
        cleared = gr.update(value=[], label="Error: Unable to fetch videos")
        return cleared, error_message

    # Each search result is (thumbnail_url, title, watch_url).
    # NOTE(review): both tuple members are raw HTML strings; the component that
    # receives them is not visible here - confirm it renders HTML.
    formatted_gallery = [
        (
            f"<img src='{thumbnail}' width='250'/>",
            f"<a href='{link}' target='_blank'>{title}</a>",
        )
        for thumbnail, title, link in results
    ]
    return formatted_gallery, ""
# Gradio interface
with gr