import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import json
import os
import glob
from pathlib import Path
from datetime import datetime
import edge_tts
import asyncio
import requests
from collections import defaultdict
from audio_recorder_streamlit import audio_recorder
import streamlit.components.v1 as components
from urllib.parse import quote
from xml.etree import ElementTree as ET
from datasets import load_dataset

# 🔧 Initialize session state variables
SESSION_VARS = {
    'search_history': [],             # Track search history
    'last_voice_input': "",           # Last voice input
    'transcript_history': [],         # Conversation history
    'should_rerun': False,            # Trigger for UI updates
    'search_columns': [],             # Available search columns
    'initial_search_done': False,     # First search flag
    'tts_voice': "en-US-AriaNeural",  # Default voice
    'arxiv_last_query': "",           # Last ArXiv search
    'dataset_loaded': False,          # Dataset load status
    'current_page': 0,                # Current data page
    'data_cache': None,               # Data cache
    'dataset_info': None              # Dataset metadata
}

# Constants
ROWS_PER_PAGE = 100

# Initialize session state
for var, default in SESSION_VARS.items():
    if var not in st.session_state:
        st.session_state[var] = default

# Cache the embedding model so it is loaded once per session, not on every rerun
@st.cache_resource
def get_model():
    return SentenceTransformer('all-MiniLM-L6-v2')

def load_dataset_page(dataset_id, token, page, rows_per_page):
    try:
        start_idx = page * rows_per_page
        end_idx = start_idx + rows_per_page
        dataset = load_dataset(
            dataset_id,
            token=token,
            streaming=False,
            split=f'train[{start_idx}:{end_idx}]'
        )
        return pd.DataFrame(dataset)
    except Exception as e:
        st.error(f"Error loading page {page}: {str(e)}")
        return pd.DataFrame()

def get_dataset_info(dataset_id, token):
    try:
        dataset = load_dataset(dataset_id, token=token, streaming=True)
        return dataset['train'].info
    except Exception as e:
        st.error(f"Error loading dataset info: {str(e)}")
        return None

def fetch_dataset_info(dataset_id):
    info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
    try:
        response = requests.get(info_url, timeout=30)
        if response.status_code == 200:
            return response.json()
    except Exception as e:
        st.warning(f"Error fetching dataset info: {e}")
    return None

def fetch_dataset_rows(dataset_id, config="default", split="train", max_rows=100):
    # Note: max_rows is currently unused; the first-rows endpoint returns a fixed page
    url = f"https://datasets-server.huggingface.co/first-rows?dataset={dataset_id}&config={config}&split={split}"
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            data = response.json()
            if 'rows' in data:
                processed_rows = []
                for row_data in data['rows']:
                    row = row_data.get('row', row_data)
                    # Parse embedding fields that arrive as stringified lists
                    for key in row:
                        if any(term in key.lower() for term in ['embed', 'vector', 'encoding']):
                            if isinstance(row[key], str):
                                try:
                                    row[key] = [float(x.strip()) for x in row[key].strip('[]').split(',') if x.strip()]
                                except ValueError:
                                    continue
                    row['_config'] = config
                    row['_split'] = split
                    processed_rows.append(row)
                return processed_rows
    except Exception as e:
        st.warning(f"Error fetching rows: {e}")
    return []

class FastDatasetSearcher:
    def __init__(self, dataset_id="tomg-group-umd/cinepile"):
        self.dataset_id = dataset_id
        self.text_model = get_model()
        self.token = os.environ.get('DATASET_KEY')
        if not self.token:
            st.error("Please set the DATASET_KEY environment variable")
            st.stop()
        if st.session_state['dataset_info'] is None:
            st.session_state['dataset_info'] = get_dataset_info(self.dataset_id, self.token)

    def load_page(self, page=0):
        return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)

    def quick_search(self, query, df):
        """Enhanced search with strict token matching and semantic relevance"""
        if df.empty or not query.strip():
            return df
        try:
            # Define stricter thresholds
            MIN_SEMANTIC_SCORE = 0.5  # Higher semantic threshold
            EXACT_MATCH_BOOST = 2.0   # Boost for exact matches

            # Get searchable columns
            searchable_cols = []
            for col in df.columns:
                sample_val = df[col].iloc[0]
                if not isinstance(sample_val, (np.ndarray, bytes)):
                    searchable_cols.append(col)

            query_lower = query.lower()
            query_terms = set(query_lower.split())
            query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]

            scores = []
            matched_any = []
            for _, row in df.iterrows():
                text_parts = []
                row_matched = False
                exact_match = False

                # Prioritize description and matched_text fields
                priority_fields = ['description', 'matched_text']
                other_fields = [col for col in searchable_cols if col not in priority_fields]

                # First check priority fields for exact matches
                for col in priority_fields:
                    if col in row:
                        val = row[col]
                        if val is not None:
                            val_str = str(val).lower()
                            # Check for exact token matches
                            if query_lower in val_str.split():
                                exact_match = True
                            if any(term in val_str.split() for term in query_terms):
                                row_matched = True
                            text_parts.append(str(val))

                # Then check other fields
                for col in other_fields:
                    val = row[col]
                    if val is not None:
                        val_str = str(val).lower()
                        if query_lower in val_str.split():
                            exact_match = True
                        if any(term in val_str.split() for term in query_terms):
                            row_matched = True
                        text_parts.append(str(val))

                text = ' '.join(text_parts)
                if text.strip():
                    # Calculate exact token matches
                    text_tokens = set(text.lower().split())
                    matching_terms = query_terms.intersection(text_tokens)
                    keyword_score = len(matching_terms) / len(query_terms)

                    # Calculate semantic score
                    text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
                    semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])

                    # Weighted scoring with priority for exact matches
                    combined_score = 0.8 * keyword_score + 0.2 * semantic_score
                    if exact_match:
                        combined_score *= EXACT_MATCH_BOOST
                    elif row_matched:
                        combined_score *= 1.2
                else:
                    combined_score = 0.0
                    row_matched = False

                scores.append(combined_score)
                matched_any.append(row_matched)

            results_df = df.copy()
            results_df['score'] = scores
            results_df['matched'] = matched_any

            # Keep direct matches, or rows scoring above the semantic threshold
            filtered_df = results_df[
                (results_df['matched']) |
                (results_df['score'] > MIN_SEMANTIC_SCORE)
            ]
            return filtered_df.sort_values('score', ascending=False)
        except Exception as e:
            st.error(f"Search error: {str(e)}")
            return df
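
# Note: FastDatasetSearcher is not wired into main() below; it offers paginated
# access to the cinepile dataset (load_page) plus hybrid keyword/semantic
# scoring (quick_search) that a caller could hook up to the UI.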

class VideoSearch:
    def __init__(self):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.dataset_id = "omegalabsinc/omega-multimodal"
        self.load_dataset()

    def fetch_dataset_rows(self):
        try:
            df, configs, splits = search_dataset(
                self.dataset_id,
                "",
                include_configs=None,
                include_splits=None
            )
            if not df.empty:
                st.session_state['search_columns'] = [col for col in df.columns
                                                      if col not in ['video_embed', 'description_embed', 'audio_embed']
                                                      and not col.startswith('_')]
                return df
            return self.load_example_data()
        except Exception as e:
            st.warning(f"Error loading videos: {e}")
            return self.load_example_data()

    def load_example_data(self):
        example_data = [{
            "video_id": "sample-123",
            "youtube_id": "dQw4w9WgXcQ",
            "description": "An example video",
            "views": 12345,
            "start_time": 0,
            "end_time": 60
        }]
        return pd.DataFrame(example_data)

    def load_dataset(self):
        self.dataset = self.fetch_dataset_rows()
        self.prepare_features()

    def prepare_features(self):
        try:
            embed_cols = [col for col in self.dataset.columns
                          if any(term in col.lower() for term in ['embed', 'vector', 'encoding'])]
            embeddings = {}
            for col in embed_cols:
                try:
                    data = []
                    for row in self.dataset[col]:
                        if isinstance(row, str):
                            values = [float(x.strip()) for x in row.strip('[]').split(',') if x.strip()]
                        elif isinstance(row, list):
                            values = row
                        else:
                            continue
                        data.append(values)
                    if data:
                        embeddings[col] = np.array(data)
                except (ValueError, TypeError):
                    continue
            # Fall back to any available embedding column if the expected ones are missing
            self.video_embeds = embeddings.get('video_embed', next(iter(embeddings.values())) if embeddings else None)
            self.text_embeds = embeddings.get('description_embed', self.video_embeds)
        except Exception:
            # Last-resort fallback: random features sized to the MiniLM embedding dimension
            num_rows = len(self.dataset)
            self.video_embeds = np.random.randn(num_rows, 384)
            self.text_embeds = np.random.randn(num_rows, 384)

    def search(self, query, column=None, top_k=20):
        """Enhanced search with better relevance scoring"""
        MIN_RELEVANCE = 0.3  # Minimum relevance threshold
        query_embedding = self.text_model.encode([query])[0]
        video_sims = cosine_similarity([query_embedding], self.video_embeds)[0]
        text_sims = cosine_similarity([query_embedding], self.text_embeds)[0]
        combined_sims = 0.7 * text_sims + 0.3 * video_sims  # Favor text matches

        if column and column in self.dataset.columns and column != "All Fields":
            # Direct matches in the specified column
            matches = self.dataset[column].astype(str).str.contains(query, case=False)
            combined_sims[matches] *= 1.5  # Boost exact matches

        # Filter by minimum relevance
        relevant_indices = np.where(combined_sims >= MIN_RELEVANCE)[0]
        if len(relevant_indices) == 0:
            return []
        top_k = min(top_k, len(relevant_indices))
        top_indices = relevant_indices[np.argsort(combined_sims[relevant_indices])[-top_k:][::-1]]

        results = []
        for idx in top_indices:
            result = {'relevance_score': float(combined_sims[idx])}
            for col in self.dataset.columns:
                if col not in ['video_embed', 'description_embed', 'audio_embed']:
                    result[col] = self.dataset.iloc[idx][col]
            results.append(result)
        return results

def search_dataset(dataset_id, search_text, include_configs=None, include_splits=None):
    dataset_info = fetch_dataset_info(dataset_id)
    if not dataset_info:
        return pd.DataFrame(), [], []
    configs = include_configs if include_configs else dataset_info.get('config_names', ['default'])
    all_rows = []
    available_splits = set()
    for config in configs:
        try:
            splits_url = f"https://datasets-server.huggingface.co/splits?dataset={dataset_id}&config={config}"
            splits_response = requests.get(splits_url, timeout=30)
            if splits_response.status_code == 200:
                splits_data = splits_response.json()
                splits = [split['split'] for split in splits_data.get('splits', [])]
                if not splits:
                    splits = ['train']
                if include_splits:
                    splits = [s for s in splits if s in include_splits]
                available_splits.update(splits)
                for split in splits:
                    rows = fetch_dataset_rows(dataset_id, config, split)
                    for row in rows:
                        text_content = ' '.join(str(v) for v in row.values()
                                                if isinstance(v, (str, int, float)))
                        if search_text.lower() in text_content.lower():
                            row['_matched_text'] = text_content
                            row['_relevance_score'] = text_content.lower().count(search_text.lower())
                            all_rows.append(row)
        except Exception as e:
            st.warning(f"Error processing config {config}: {e}")
            continue
    if all_rows:
        df = pd.DataFrame(all_rows)
        df = df.sort_values('_relevance_score', ascending=False)
        return df, configs, list(available_splits)
    return pd.DataFrame(), configs, list(available_splits)

def get_speech_model():
    return edge_tts.Communicate

async def generate_speech(text, voice=None):
    if not text.strip():
        return None
    if not voice:
        voice = st.session_state['tts_voice']
    try:
        communicate = get_speech_model()(text, voice)
        audio_file = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
        await communicate.save(audio_file)
        return audio_file
    except Exception as e:
        st.error(f"Error generating speech: {e}")
        return None
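
# Note: generate_speech is a coroutine; callers below drive it with
# asyncio.run(...), which works here because Streamlit scripts run
# outside an active asyncio event loop.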

def transcribe_audio(audio_path):
    """Placeholder for ASR implementation"""
    return "ASR not implemented. Add your preferred speech recognition here!"

def arxiv_search(query, max_results=5):
    base_url = "http://export.arxiv.org/api/query?"
    search_url = base_url + f"search_query={quote(query)}&start=0&max_results={max_results}"
    try:
        r = requests.get(search_url, timeout=30)
        if r.status_code == 200:
            root = ET.fromstring(r.text)
            ns = {'atom': 'http://www.w3.org/2005/Atom'}
            entries = root.findall('atom:entry', ns)
            results = []
            for entry in entries:
                title = entry.find('atom:title', ns).text.strip()
                summary = entry.find('atom:summary', ns).text.strip()
                link = next((l.get('href') for l in entry.findall('atom:link', ns)
                             if l.get('type') == 'text/html'), None)
                results.append((title, summary, link))
            return results
    except Exception as e:
        st.error(f"ArXiv search error: {e}")
    return []

def show_file_manager():
    st.subheader("📂 File Manager")
    col1, col2 = st.columns(2)
    with col1:
        uploaded_file = st.file_uploader("Upload File", type=['txt', 'md', 'mp3'])
        if uploaded_file:
            with open(uploaded_file.name, "wb") as f:
                f.write(uploaded_file.getvalue())
            st.success(f"Uploaded: {uploaded_file.name}")
            st.experimental_rerun()
    with col2:
        if st.button("🗑️ Clear Files"):
            for f in glob.glob("*.txt") + glob.glob("*.md") + glob.glob("*.mp3"):
                os.remove(f)
            st.success("All files cleared!")
            st.experimental_rerun()

    files = glob.glob("*.txt") + glob.glob("*.md") + glob.glob("*.mp3")
    if files:
        st.write("### Existing Files")
        for f in files:
            with st.expander(f"📄 {os.path.basename(f)}"):
                if f.endswith('.mp3'):
                    st.audio(f)
                else:
                    with open(f, 'r', encoding='utf-8') as file:
                        st.text_area("Content", file.read(), height=100)
                if st.button(f"Delete {os.path.basename(f)}", key=f"del_{f}"):
                    os.remove(f)
                    st.experimental_rerun()

def perform_arxiv_lookup(query, vocal_summary=True, titles_summary=True, full_audio=False):
    results = arxiv_search(query, max_results=5)
    if not results:
        st.write("No results found.")
        return
    st.markdown(f"**ArXiv Results for '{query}':**")
    for i, (title, summary, link) in enumerate(results, start=1):
        st.markdown(f"**{i}. {title}**")
        st.write(summary)
        if link:
            st.markdown(f"[View Paper]({link})")
    if vocal_summary:
        spoken_text = f"Here are ArXiv results for {query}. "
        if titles_summary:
            spoken_text += " Titles: " + ", ".join([res[0] for res in results])
        else:
            spoken_text += " " + results[0][1][:200]
        audio_file = asyncio.run(generate_speech(spoken_text))
        if audio_file:
            st.audio(audio_file)
    if full_audio:
        full_text = ""
        for i, (title, summary, _) in enumerate(results, start=1):
            full_text += f"Result {i}: {title}. {summary} "
        audio_file_full = asyncio.run(generate_speech(full_text))
        if audio_file_full:
            st.write("### Full Audio Summary")
            st.audio(audio_file_full)

def render_result(result):
    """Render a search result with voice selection and TTS options"""
    score = result.get('relevance_score', 0)
    result_filtered = {k: v for k, v in result.items()
                       if k not in ['relevance_score', 'video_embed', 'description_embed', 'audio_embed']}
    if 'youtube_id' in result:
        st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result.get('start_time', 0)}")

    cols = st.columns([2, 1])
    with cols[0]:
        text_content = []  # Collect text for TTS
        for key, value in result_filtered.items():
            if isinstance(value, (str, int, float)):
                st.write(f"**{key}:** {value}")
                if isinstance(value, str) and len(value.strip()) > 0:
                    text_content.append(f"{key}: {value}")
    with cols[1]:
        st.metric("Relevance Score", f"{score:.2%}")

    # Voice selection for TTS
    voices = {
        "Aria (US Female)": "en-US-AriaNeural",
        "Guy (US Male)": "en-US-GuyNeural",
        "Sonia (UK Female)": "en-GB-SoniaNeural",
        "Tony (UK Male)": "en-GB-TonyNeural",
        "Jenny (US Female)": "en-US-JennyNeural"
    }
    selected_voice = st.selectbox(
        "Select Voice",
        list(voices.keys()),
        key=f"voice_{result.get('video_id', '')}"
    )
    if st.button("🔊 Read Description", key=f"read_{result.get('video_id', '')}"):
        text_to_read = ". ".join(text_content)
        audio_file = asyncio.run(generate_speech(text_to_read, voices[selected_voice]))
        if audio_file:
            st.audio(audio_file)

def main():
    st.title("🎥 Advanced Video & Dataset Search with Voice")

    # Initialize search
    search = VideoSearch()

    # Create tabs
    tab1, tab2, tab3, tab4 = st.tabs([
        "🔍 Search", "🎙️ Voice Input", "📚 ArXiv", "📂 Files"
    ])

    # Search Tab
    with tab1:
        st.subheader("Search Videos")
        col1, col2 = st.columns([3, 1])
        with col1:
            query = st.text_input("Enter search query:",
                                  value="" if st.session_state['initial_search_done'] else "aliens")
        with col2:
            search_column = st.selectbox("Search in:",
                                         ["All Fields"] + st.session_state['search_columns'])
        col3, col4 = st.columns(2)
        with col3:
            num_results = st.slider("Max results:", 1, 100, 20)
        with col4:
            search_button = st.button("🔍 Search")

        if (search_button or not st.session_state['initial_search_done']) and query:
            st.session_state['initial_search_done'] = True
            selected_column = None if search_column == "All Fields" else search_column
            with st.spinner("Searching..."):
                results = search.search(query, selected_column, num_results)
            if results:
                st.session_state['search_history'].append({
                    'query': query,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'results': results[:5]
                })
                st.write(f"Found {len(results)} results:")
                for i, result in enumerate(results, 1):
                    with st.expander(f"Result {i}", expanded=(i == 1)):
                        render_result(result)
            else:
                st.warning("No matching results found.")

    # Voice Input Tab
    with tab2:
        st.subheader("Voice Search")
        st.write("🎙️ Record your query:")
        audio_bytes = audio_recorder()
        if audio_bytes:
            with st.spinner("Processing audio..."):
                audio_path = f"temp_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
                with open(audio_path, "wb") as f:
                    f.write(audio_bytes)
                voice_query = transcribe_audio(audio_path)
            st.markdown("**Transcribed Text:**")
            st.write(voice_query)
            st.session_state['last_voice_input'] = voice_query
            if st.button("🔍 Search from Voice"):
                results = search.search(voice_query, None, 20)
                for i, result in enumerate(results, 1):
                    with st.expander(f"Result {i}", expanded=(i == 1)):
                        render_result(result)
            if os.path.exists(audio_path):
                os.remove(audio_path)

    # ArXiv Tab
    with tab3:
        st.subheader("ArXiv Search")
        arxiv_query = st.text_input("Search ArXiv:", value=st.session_state['arxiv_last_query'])
        vocal_summary = st.checkbox("🔊 Quick Audio Summary", value=True)
        titles_summary = st.checkbox("📝 Titles Only", value=True)
        full_audio = st.checkbox("📚 Full Audio Summary", value=False)
        if st.button("🔍 Search ArXiv"):
            st.session_state['arxiv_last_query'] = arxiv_query
            perform_arxiv_lookup(arxiv_query, vocal_summary, titles_summary, full_audio)

    # File Manager Tab
    with tab4:
        show_file_manager()

    # Sidebar
    with st.sidebar:
        st.subheader("⚙️ Settings & History")
        if st.button("🗑️ Clear History"):
            st.session_state['search_history'] = []
            st.experimental_rerun()
        st.markdown("### Recent Searches")
        for entry in reversed(st.session_state['search_history'][-5:]):
            with st.expander(f"{entry['timestamp']}: {entry['query']}"):
                for i, result in enumerate(entry['results'], 1):
                    st.write(f"{i}. {result.get('description', '')[:100]}...")
        st.markdown("### Voice Settings")
        st.selectbox("TTS Voice:", [
            "en-US-AriaNeural",
            "en-US-GuyNeural",
            "en-GB-SoniaNeural"
        ], key="tts_voice")

if __name__ == "__main__":
    main()
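
# Likely runtime dependencies, inferred from the imports above (the Space's
# actual requirements.txt is not shown here): streamlit, pandas, numpy,
# sentence-transformers, scikit-learn, torch, datasets, edge-tts, requests,
# audio-recorder-streamlit.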