Spaces:

Agents-MCP-Hackathon
/

doc-mcp

Running

App Files Files Community

doc-mcp / app.py

mdabidhussain

added tools for github file listing and retrieval

3fcc667 3 months ago

raw

history blame

65.6 kB

	import asyncio
	import os
	import time
	import traceback
	from typing import Dict, List

	import gradio as gr
	from dotenv import load_dotenv
	from llama_index.core import Settings
	from llama_index.core.text_splitter import SentenceSplitter

	from rag.config import (delete_repository_data, embed_model,
	get_available_repos, get_repo_details,
	get_repository_stats, llm)
	from rag.github_file_loader import \
	fetch_markdown_files as fetch_files_with_loader
	from rag.github_file_loader import fetch_repository_files, load_github_files
	from rag.ingest import ingest_documents_async
	from rag.query import QueryRetriever

	load_dotenv()

	Settings.llm = llm
	Settings.embed_model = embed_model
	Settings.node_parser = SentenceSplitter(chunk_size=3072)


	def get_available_repositories():
	    """Return whatever ``get_available_repos`` reports, unchanged."""
	    repositories = get_available_repos()
	    return repositories


	def start_file_loading(
	    repo_url: str, selected_files: List[str], current_progress: Dict
	):
	    """Step 1: Load files from GitHub.

	    Downloads ``selected_files`` in batches through ``load_github_files`` and
	    records progress in ``current_progress``.

	    Args:
	        repo_url: ``owner/repo`` or a GitHub URL. ``https://``/``http://``,
	            an optional ``www.``, a scheme-less ``github.com/...`` and
	            ``git@github.com:...`` are accepted; surrounding whitespace,
	            trailing slashes, extra path segments and a ``.git`` suffix are
	            ignored.
	        selected_files: File paths inside the repository to load.
	        current_progress: Progress dict; it is updated in place.

	    Returns:
	        A new error dict when nothing is selected or the URL has no
	        ``owner/repo`` part. Otherwise ``current_progress``, holding either
	        the loaded documents (``step == "file_loading_complete"``) or the
	        error details.
	    """
	    print("\n🔄 STARTING FILE LOADING STEP")
	    print(f"📍 Repository: {repo_url}")
	    print(f"📋 Selected files: {selected_files}")

	    if not selected_files:
	        return {
	            "status": "error",
	            "message": "❌ No files selected for loading",
	            "progress": 0,
	            "details": "",
	            "step": "file_loading",
	        }

	    total_files = len(selected_files)
	    start_time = time.time()

	    # Parse repo name from URL
	    repo_name = repo_url.strip()
	    if "github.com" in repo_name:
	        github_prefixes = (
	            "https://www.github.com/",
	            "http://www.github.com/",
	            "https://github.com/",
	            "http://github.com/",
	            "www.github.com/",
	            "github.com/",
	            "git@github.com:",
	        )
	        for prefix in github_prefixes:
	            if repo_name.startswith(prefix):
	                repo_name = repo_name[len(prefix):]
	                break
	        # Keep only "owner/repo": drop trailing slashes, deeper path segments
	        # (e.g. /tree/main/docs) and the ".git" suffix of clone URLs.
	        repo_name = "/".join(repo_name.strip("/").split("/")[:2])
	        if repo_name.endswith(".git"):
	            repo_name = repo_name[: -len(".git")]
	        if "/" not in repo_name:
	            return {
	                "status": "error",
	                "message": "❌ Invalid repository URL format",
	                "progress": 0,
	                "details": "",
	                "step": "file_loading",
	            }

	    try:
	        batch_size = 25
	        all_documents = []
	        all_failed = []

	        current_progress.update(
	            {
	                "status": "loading",
	                "message": f"🚀 Loading files from {repo_name}",
	                "progress": 0,
	                "total_files": total_files,
	                "processed_files": 0,
	                "phase": "File Loading",
	                "details": f"Processing {total_files} files in batches...",
	                "step": "file_loading",
	            }
	        )

	        for i in range(0, total_files, batch_size):
	            batch = selected_files[i : i + batch_size]
	            batch_number = i // batch_size + 1
	            batch_names = ", ".join(path.split("/")[-1] for path in batch)

	            print(f"\n📦 PROCESSING BATCH {batch_number}")
	            print(f" Files: {batch}")

	            # Update progress for current batch
	            current_progress.update(
	                {
	                    "progress": (i / total_files) * 100,
	                    "processed_files": i,
	                    "current_batch": batch_number,
	                    "details": f"Loading batch {batch_number}: {batch_names}",
	                }
	            )

	            try:
	                documents, failed = load_github_files(
	                    repo_name=repo_name,
	                    file_paths=batch,
	                    branch="main",
	                    concurrent_requests=10,
	                    github_token=os.getenv("GITHUB_API_KEY"),
	                )

	                print("✅ Load results:")
	                print(f" - Documents: {len(documents)}")
	                print(f" - Failed: {len(failed)}")

	                for j, doc in enumerate(documents):
	                    print(f" 📄 Doc {j + 1}: {doc.doc_id}")
	                    print(f" Size: {len(doc.text)} chars")

	                    # Ensure repo metadata is set
	                    if "repo" not in doc.metadata:
	                        doc.metadata["repo"] = repo_name
	                        print(f" ✅ Added repo metadata: {repo_name}")

	                all_documents.extend(documents)
	                all_failed.extend(failed)

	            except Exception as batch_error:
	                # One failing batch must not abort the run: record its files
	                # as failed and continue with the next batch.
	                print(f"❌ Batch processing error: {batch_error}")
	                all_failed.extend(batch)

	        loading_time = time.time() - start_time

	        # Store loaded documents in progress state for next step
	        current_progress.update(
	            {
	                "status": "loaded",
	                "message": f"✅ File Loading Complete! Loaded {len(all_documents)} documents",
	                "progress": 100,
	                "phase": "Files Loaded",
	                "details": f"Successfully loaded {len(all_documents)} documents in {loading_time:.1f}s",
	                "step": "file_loading_complete",
	                "loaded_documents": all_documents,  # Store documents for next step
	                "failed_files": all_failed,
	                "loading_time": loading_time,
	                "repo_name": repo_name,
	            }
	        )

	        return current_progress

	    except Exception as e:
	        total_time = time.time() - start_time
	        error_msg = f"❌ File loading error after {total_time:.1f}s: {str(e)}"
	        print(error_msg)

	        current_progress.update(
	            {
	                "status": "error",
	                "message": error_msg,
	                "progress": 0,
	                "phase": "Failed",
	                "details": str(e),
	                "error": str(e),
	                "step": "file_loading",
	            }
	        )

	        return current_progress


	def start_vector_ingestion(current_progress: Dict):
	    """Step 2: Ingest loaded documents into vector store.

	    Args:
	        current_progress: Progress dict left by the file-loading step. It must
	            have ``step == "file_loading_complete"`` and a non-empty
	            ``loaded_documents`` list; it is updated in place.

	    Returns:
	        A new error dict when the precondition is not met, otherwise
	        ``current_progress`` updated with the completion summary or with the
	        ingestion error.
	    """
	    print("\n🔄 STARTING VECTOR INGESTION STEP")

	    # Check if we have loaded documents from previous step
	    if current_progress.get("step") != "file_loading_complete":
	        return {
	            "status": "error",
	            "message": "❌ No loaded documents found. Please load files first.",
	            "progress": 0,
	            "details": "",
	            "step": "vector_ingestion",
	        }

	    all_documents = current_progress.get("loaded_documents", [])
	    repo_name = current_progress.get("repo_name", "")

	    if not all_documents:
	        return {
	            "status": "error",
	            "message": "❌ No documents available for vector ingestion",
	            "progress": 0,
	            "details": "",
	            "step": "vector_ingestion",
	        }

	    # "failed_files" may hold either the list of failed paths or a plain
	    # count, so normalise it once for both the success and the error state.
	    failed_files_data = current_progress.get("failed_files", [])
	    if isinstance(failed_files_data, list):
	        failed_files_count = len(failed_files_data)
	    elif isinstance(failed_files_data, int):
	        failed_files_count = failed_files_data
	    else:
	        failed_files_count = 0

	    vector_start_time = time.time()

	    # Update state for vector store phase
	    current_progress.update(
	        {
	            "status": "vectorizing",
	            "message": "🔄 Generating embeddings and storing in vector database",
	            "progress": 0,
	            "phase": "Vector Store Ingestion",
	            "details": f"Processing {len(all_documents)} documents for embedding...",
	            "step": "vector_ingestion",
	            "documents_count": len(all_documents),
	        }
	    )

	    try:
	        print("🔄 STARTING VECTOR STORE INGESTION")
	        print(f" Repository: {repo_name}")
	        print(f" Documents to process: {len(all_documents)}")

	        # Call the async ingestion function with repo name
	        loop = asyncio.new_event_loop()
	        asyncio.set_event_loop(loop)
	        try:
	            loop.run_until_complete(ingest_documents_async(all_documents, repo_name))
	        finally:
	            # Unregister the loop before closing it so that a closed loop is
	            # not left behind as this thread's current event loop.
	            asyncio.set_event_loop(None)
	            loop.close()

	        vector_time = time.time() - vector_start_time
	        loading_time = current_progress.get("loading_time", 0)
	        total_time = loading_time + vector_time

	        print(f"✅ Vector ingestion completed in {vector_time:.2f} seconds")

	        # Update final success state with repository update flag
	        current_progress.update(
	            {
	                "status": "complete",
	                "message": "✅ Complete Ingestion Pipeline Finished!",
	                "progress": 100,
	                "phase": "Complete",
	                "details": f"Successfully processed {len(all_documents)} documents for {repo_name}",
	                "step": "complete",
	                "total_time": total_time,
	                "documents_processed": len(all_documents),
	                "failed_files_count": failed_files_count,
	                "failed_files": failed_files_data,  # Keep original data
	                "vector_time": vector_time,
	                "loading_time": loading_time,
	                "repo_name": repo_name,
	                "repository_updated": True,  # Flag to trigger repo list refresh
	            }
	        )

	        return current_progress

	    except Exception as ingest_error:
	        vector_time = time.time() - vector_start_time
	        print(f"❌ Vector ingestion failed after {vector_time:.2f} seconds")
	        print(f"❌ Error: {ingest_error}")

	        current_progress.update(
	            {
	                "status": "error",
	                "message": "❌ Vector Store Ingestion Failed",
	                "progress": 0,
	                "phase": "Failed",
	                "details": f"Error: {str(ingest_error)}",
	                "error": str(ingest_error),
	                "step": "vector_ingestion",
	                "failed_files_count": failed_files_count,
	                "failed_files": failed_files_data,
	            }
	        )

	        return current_progress

	def start_file_loading_generator(
	    repo_url: str, selected_files: List[str], current_progress: Dict
	):
	    """Step 1: Load files from GitHub with yield-based real-time updates.

	    Args:
	        repo_url: ``owner/repo`` or a GitHub URL. ``https://``/``http://``,
	            an optional ``www.``, a scheme-less ``github.com/...`` and
	            ``git@github.com:...`` are accepted; surrounding whitespace,
	            trailing slashes, extra path segments and a ``.git`` suffix are
	            ignored.
	        selected_files: File paths inside the repository to load.
	        current_progress: Accepted for interface compatibility; not read.

	    Yields:
	        A freshly built progress dict for the initial state, before and after
	        every batch, and for the final result or error. Because every dict is
	        built from scratch, each in-flight update repeats ``total_files`` and
	        ``total_batches``.

	    Returns:
	        The last yielded dict (as the generator's return value).
	    """

	    print("\n🔄 STARTING FILE LOADING STEP")
	    print(f"📍 Repository: {repo_url}")
	    print(f"📋 Selected files: {len(selected_files)} files")

	    if not selected_files:
	        error_progress = {
	            "status": "error",
	            "message": "❌ No files selected for loading",
	            "progress": 0,
	            "details": "Please select at least one file to proceed.",
	            "step": "file_loading",
	        }
	        yield error_progress
	        return error_progress

	    total_files = len(selected_files)
	    start_time = time.time()

	    # Parse repo name from URL
	    repo_name = repo_url.strip()
	    if "github.com" in repo_name:
	        github_prefixes = (
	            "https://www.github.com/",
	            "http://www.github.com/",
	            "https://github.com/",
	            "http://github.com/",
	            "www.github.com/",
	            "github.com/",
	            "git@github.com:",
	        )
	        for prefix in github_prefixes:
	            if repo_name.startswith(prefix):
	                repo_name = repo_name[len(prefix):]
	                break
	        # Keep only "owner/repo": drop trailing slashes, deeper path segments
	        # (e.g. /tree/main/docs) and the ".git" suffix of clone URLs.
	        repo_name = "/".join(repo_name.strip("/").split("/")[:2])
	        if repo_name.endswith(".git"):
	            repo_name = repo_name[: -len(".git")]
	        if "/" not in repo_name:
	            error_progress = {
	                "status": "error",
	                "message": "❌ Invalid repository URL format",
	                "progress": 0,
	                "details": "Expected format: owner/repo or https://github.com/owner/repo",
	                "step": "file_loading",
	            }
	            yield error_progress
	            return error_progress

	    try:
	        batch_size = 10
	        total_batches = (total_files + batch_size - 1) // batch_size
	        all_documents = []
	        all_failed = []

	        # Fields common to every in-flight update.
	        shared = {
	            "status": "loading",
	            "phase": "File Loading",
	            "step": "file_loading",
	            "repo_name": repo_name,
	            "total_files": total_files,
	            "total_batches": total_batches,
	        }

	        # Initial progress update
	        yield {
	            **shared,
	            "message": f"🚀 Starting file loading from {repo_name}",
	            "progress": 0,
	            "processed_files": 0,
	            "successful_files": 0,
	            "failed_files": 0,
	            "details": f"Preparing to load {total_files} files in batches of {batch_size}...",
	            "current_batch": 0,
	        }

	        time.sleep(0.5)

	        for i in range(0, total_files, batch_size):
	            batch = selected_files[i : i + batch_size]
	            current_batch_num = i // batch_size + 1
	            file_names = [path.split("/")[-1] for path in batch]
	            preview = ", ".join(file_names[:3]) + ("..." if len(batch) > 3 else "")

	            # Update progress at batch start; loading covers 0-90%, the
	            # completion update below jumps to 100%.
	            yield {
	                **shared,
	                "message": f"🔄 Loading batch {current_batch_num}/{total_batches}",
	                "progress": (i / total_files) * 90,
	                "processed_files": i,
	                "successful_files": len(all_documents),
	                "failed_files": len(all_failed),
	                "current_batch": current_batch_num,
	                "details": f"Processing batch {current_batch_num}: {preview}",
	            }

	            try:
	                print(f"\n📦 PROCESSING BATCH {current_batch_num}/{total_batches}")
	                print(f" Files: {file_names}")

	                documents, failed = load_github_files(
	                    repo_name=repo_name,
	                    file_paths=batch,
	                    branch="main",
	                    concurrent_requests=10,
	                    github_token=os.getenv("GITHUB_API_KEY"),
	                )

	                print("✅ Load results:")
	                print(f" - Documents: {len(documents)}")
	                print(f" - Failed: {len(failed)}")

	                # Process documents
	                for j, doc in enumerate(documents):
	                    print(f" 📄 Doc {j + 1}: {doc.doc_id}")
	                    print(f" Size: {len(doc.text)} chars")

	                    if "repo" not in doc.metadata:
	                        doc.metadata["repo"] = repo_name
	                        print(f" ✅ Added repo metadata: {repo_name}")

	                all_documents.extend(documents)
	                all_failed.extend(failed)

	                # Update progress after batch completion
	                yield {
	                    **shared,
	                    "message": f"✅ Completed batch {current_batch_num}/{total_batches}",
	                    "progress": ((i + len(batch)) / total_files) * 90,
	                    "processed_files": i + len(batch),
	                    "successful_files": len(all_documents),
	                    "failed_files": len(all_failed),
	                    "current_batch": current_batch_num,
	                    "details": f"✅ Batch {current_batch_num} complete: {len(documents)} loaded, {len(failed)} failed. Total progress: {len(all_documents)} documents loaded.",
	                }

	                time.sleep(0.3)

	            except Exception as batch_error:
	                # One failing batch must not abort the run: record its files
	                # as failed and continue with the next batch.
	                print(f"❌ Batch processing error: {batch_error}")
	                all_failed.extend(batch)

	                yield {
	                    **shared,
	                    "message": f"⚠️ Error in batch {current_batch_num}",
	                    "progress": ((i + len(batch)) / total_files) * 90,
	                    "processed_files": i + len(batch),
	                    "successful_files": len(all_documents),
	                    "failed_files": len(all_failed),
	                    "current_batch": current_batch_num,
	                    "details": f"❌ Batch {current_batch_num} error: {str(batch_error)[:100]}... Continuing with next batch.",
	                }

	        loading_time = time.time() - start_time

	        # The loader may report neither documents nor failures; avoid a
	        # division by zero in that case.
	        attempted = len(all_documents) + len(all_failed)
	        success_rate = (len(all_documents) / attempted * 100) if attempted else 0.0

	        # Final completion update
	        completion_progress = {
	            "status": "loaded",
	            "message": f"✅ File Loading Complete! Loaded {len(all_documents)} documents",
	            "progress": 100,
	            "phase": "Files Loaded Successfully",
	            "details": f"🎯 Final Results:\n✅ Successfully loaded: {len(all_documents)} documents\n❌ Failed files: {len(all_failed)}\n⏱️ Total time: {loading_time:.1f}s\n📊 Success rate: {success_rate:.1f}%",
	            "step": "file_loading_complete",
	            "loaded_documents": all_documents,
	            "failed_files": all_failed,
	            "loading_time": loading_time,
	            "repo_name": repo_name,
	            "total_files": total_files,
	            "processed_files": total_files,
	            "successful_files": len(all_documents),
	        }
	        yield completion_progress
	        return completion_progress

	    except Exception as e:
	        total_time = time.time() - start_time
	        error_msg = f"❌ File loading error after {total_time:.1f}s: {str(e)}"
	        print(error_msg)

	        error_progress = {
	            "status": "error",
	            "message": error_msg,
	            "progress": 0,
	            "phase": "Loading Failed",
	            "details": f"Critical error during file loading:\n{str(e)}",
	            "error": str(e),
	            "step": "file_loading",
	        }
	        yield error_progress
	        return error_progress

	# Progress display component
	def format_progress_display(progress_state: Dict) -> str:
	    """Format progress state into readable display with enhanced details.

	    Args:
	        progress_state: Progress dict from the loading/ingestion steps.
	            Missing keys fall back to neutral defaults; an empty dict yields
	            the "ready" banner.

	    Returns:
	        Multi-line text: status line, progress bar, step-specific counters,
	        details, and a summary (``status == "complete"``) or troubleshooting
	        section (``status == "error"``).
	    """
	    if not progress_state:
	        return "🚀 Ready to start ingestion...\n\n📋 Two-Step Process:\n1️⃣ Load files from GitHub repository\n2️⃣ Generate embeddings and store in vector database"

	    status = progress_state.get("status", "unknown")
	    message = progress_state.get("message", "")
	    progress = progress_state.get("progress", 0)
	    phase = progress_state.get("phase", "")
	    details = progress_state.get("details", "")
	    step = progress_state.get("step")

	    # Enhanced progress bar: 40 chars total, one cell per 2.5%. The value is
	    # clamped so out-of-range percentages cannot distort the bar width.
	    filled = int(min(max(progress, 0), 100) / 2.5)
	    progress_bar = "█" * filled + "░" * (40 - filled)

	    # Status emoji mapping
	    status_emoji = {
	        "loading": "⏳",
	        "loaded": "✅",
	        "vectorizing": "🧠",
	        "complete": "🎉",
	        "error": "❌",
	    }
	    emoji = status_emoji.get(status, "🔄")

	    parts = [
	        f"{emoji} {message}\n\n",
	        # Phase and progress section
	        f"📊 Current Phase: {phase}\n",
	        f"📈 Progress: {progress:.1f}%\n",
	        f"[{progress_bar}] {progress:.1f}%\n\n",
	    ]

	    # Step-specific details for file loading
	    if step == "file_loading":
	        processed = progress_state.get("processed_files", 0)
	        total = progress_state.get("total_files", 0)
	        successful = progress_state.get("successful_files", 0)
	        failed = progress_state.get("failed_files", 0)
	        # "failed_files" is a count in some states and a list in others.
	        if isinstance(failed, list):
	            failed = len(failed)

	        if total > 0:
	            parts.append("📁 File Processing Status:\n")
	            parts.append(f" • Total files: {total}\n")
	            parts.append(f" • Processed: {processed}/{total}\n")
	            parts.append(f" • ✅ Successful: {successful}\n")
	            parts.append(f" • ❌ Failed: {failed}\n")

	            if "current_batch" in progress_state and "total_batches" in progress_state:
	                parts.append(
	                    f" • 📦 Current batch: {progress_state['current_batch']}/{progress_state['total_batches']}\n"
	                )
	            parts.append("\n")

	    # Step-specific details for vector ingestion
	    elif step == "vector_ingestion":
	        docs_count = progress_state.get("documents_count")
	        if docs_count is None:
	            # Fall back to the documents carried over from the loading step.
	            docs_count = len(progress_state.get("loaded_documents") or [])
	        repo_name = progress_state.get("repo_name", "Unknown")

	        if docs_count > 0:
	            parts.append("🧠 Vector Processing Status:\n")
	            parts.append(f" • Repository: {repo_name}\n")
	            parts.append(f" • Documents: {docs_count:,}\n")
	            parts.append(f" • Stage: {phase}\n\n")

	    # Detailed information
	    parts.append(f"📝 Details:\n{details}\n")

	    # Final summary for completion
	    if status == "complete":
	        total_time = progress_state.get("total_time", 0)
	        docs_processed = progress_state.get("documents_processed", 0)
	        failed_files = progress_state.get("failed_files", 0)
	        vector_time = progress_state.get("vector_time", 0)
	        loading_time = progress_state.get("loading_time", 0)
	        repo_name = progress_state.get("repo_name", "Unknown")
	        # A missing or zero duration must not crash the summary.
	        rate = docs_processed / total_time if total_time > 0 else 0.0

	        parts.append("\n🎊 INGESTION COMPLETED SUCCESSFULLY!\n")
	        parts.append("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n")
	        parts.append(f"🎯 Repository: {repo_name}\n")
	        parts.append(f"📄 Documents processed: {docs_processed:,}\n")
	        parts.append(
	            f"❌ Failed files: {len(failed_files) if isinstance(failed_files, list) else failed_files}\n"
	        )
	        parts.append(f"⏱️ Total time: {total_time:.1f} seconds\n")
	        parts.append(f" ├─ File loading: {loading_time:.1f}s\n")
	        parts.append(f" └─ Vector processing: {vector_time:.1f}s\n")
	        parts.append(f"📊 Processing rate: {rate:.1f} docs/second\n\n")
	        parts.append("🚀 Next Step: Go to the 'Query Interface' tab to start asking questions!")

	    elif status == "error":
	        # Coerce to text: the slice below needs a string, not an exception.
	        error = str(progress_state.get("error") or "Unknown error")
	        parts.append("\n💥 ERROR OCCURRED\n")
	        parts.append("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n")
	        parts.append(f"❌ Error Details: {error[:300]}{'...' if len(error) > 300 else ''}\n")
	        parts.append("\n🔧 Troubleshooting Tips:\n")
	        parts.append(" • Check your GitHub token permissions\n")
	        parts.append(" • Verify repository URL format\n")
	        parts.append(" • Ensure selected files exist\n")
	        parts.append(" • Check network connectivity\n")

	    return "".join(parts)


	# Create the main Gradio interface
	with gr.Blocks(title="Doc-MCP") as demo:
	gr.Markdown("# 📚Doc-MCP: Documentation RAG System")
	gr.Markdown(
	"Transform GitHub documentation repositories into accessible MCP (Model Context Protocol) servers for AI agents. Upload documentation, generate vector embeddings, and query with intelligent context retrieval."
	)

	# State variables
	files_state = gr.State([])
	progress_state = gr.State({})

	with gr.Tabs():
	with gr.TabItem("📥 Documentation Ingestion"):
	gr.Markdown("### 🚀 Two-Step Documentation Processing Pipeline")
	gr.Markdown(
	"Step 1: Fetch markdown files from GitHub repository → Step 2: Generate vector embeddings and store in MongoDB Atlas"
	)

	with gr.Row():
	with gr.Column(scale=2):
	repo_input = gr.Textbox(
	label="📂 GitHub Repository URL",
	placeholder="Enter: owner/repo or https://github.com/owner/repo (e.g., gradio-app/gradio)",
	value="",
	info="Enter any GitHub repository containing markdown documentation"
	)
	load_btn = gr.Button("🔍 Discover Documentation Files", variant="secondary")

	with gr.Column(scale=1):
	status_output = gr.Textbox(
	label="Repository Discovery Status", interactive=False, lines=4,
	placeholder="Repository scanning results will appear here..."
	)
	with gr.Row():
	select_all_btn = gr.Button("📋 Select All Documents", variant="secondary")
	clear_all_btn = gr.Button("🗑️ Clear Selection", variant="secondary")

	# File selection
	with gr.Accordion(label="Available Documentation Files"):
	file_selector = gr.CheckboxGroup(
	choices=[], label="Select Markdown Files for RAG Processing", visible=False
	)

	# Two-step ingestion controls
	gr.Markdown("### 🔄 RAG Pipeline Execution")
	gr.Markdown("Process your documentation through our advanced RAG pipeline using Nebius AI embeddings and MongoDB Atlas vector storage.")

	with gr.Row():
	with gr.Column():
	step1_btn = gr.Button(
	"📥 Step 1: Load Files from GitHub",
	variant="primary",
	size="lg",
	interactive=False,
	)

	with gr.Column():
	step2_btn = gr.Button(
	"🔄 Step 2: Start Ingestion",
	variant="primary",
	size="lg",
	interactive=False,
	)

	with gr.Row():
	refresh_btn = gr.Button("🔄 Refresh Progress", variant="secondary")
	reset_btn = gr.Button("🗑️ Reset Progress", variant="secondary")

	# Progress display
	progress_display = gr.Textbox(
	label="📊 Real-time Ingestion Progress",
	interactive=False,
	lines=25,
	value="🚀 Ready to start two-step ingestion process...\n\n📋 Steps:\n1️⃣ Load files from GitHub repository\n2️⃣ Generate embeddings and store in vector database",
	max_lines=30,
	show_copy_button=True,
	)

	# Event handlers
	def load_files_handler(repo_url: str):
	    """Discover files for ``repo_url`` and build the matching UI updates.

	    Returns a 5-tuple: file selector update, status message, list of
	    discovered files, step-1 button update, step-2 button update.
	    """

	    def _nothing_to_select(status_text):
	        # Hidden selector, empty file list, both step buttons disabled.
	        return (
	            gr.CheckboxGroup(choices=[], visible=False),
	            status_text,
	            [],
	            gr.Button(interactive=False),
	            gr.Button(interactive=False),
	        )

	    if not repo_url.strip():
	        return _nothing_to_select("Please enter a repository URL")

	    files, message = fetch_files_with_loader(repo_url)
	    if not files:
	        return _nothing_to_select(message)

	    selector = gr.CheckboxGroup(
	        choices=files,
	        value=[],
	        label=f"Select Files from {repo_url} ({len(files)} files)",
	        visible=True,
	    )
	    step1_update = gr.Button(interactive=True)  # Enable step 1 button
	    step2_update = gr.Button(interactive=False)  # Keep step 2 disabled
	    return selector, message, files, step1_update, step2_update

	def start_step1_generator(repo_url: str, selected_files: List[str], current_progress: Dict):
	    """Run step 1 and stream (state, display text, step-2 button) updates."""
	    updates = start_file_loading_generator(
	        repo_url, selected_files, current_progress.copy()
	    )
	    for snapshot in updates:
	        rendered = format_progress_display(snapshot)
	        # Step 2 is unlocked only by the final "loading complete" update.
	        loading_done = snapshot.get("step") == "file_loading_complete"
	        yield snapshot, rendered, gr.Button(interactive=loading_done)

	def start_step2(current_progress: Dict):
	    """Run step 2 (vector ingestion) on a copy of the current state.

	    Returns the resulting state together with its formatted display text.
	    """
	    snapshot = current_progress.copy()
	    outcome = start_vector_ingestion(snapshot)
	    return outcome, format_progress_display(outcome)

	def refresh_progress(current_progress: Dict):
	    """Re-render the display text for the current progress state."""
	    return format_progress_display(current_progress)

	def reset_progress():
	    """Return an empty state, the idle message and a disabled button update."""
	    cleared_state = {}
	    idle_text = "Ready to start two-step ingestion process..."
	    locked_button = gr.Button(interactive=False)
	    return cleared_state, idle_text, locked_button

	def select_all_handler(available_files):
	    """Tick every available file; an empty/missing list clears the selection."""
	    selection = available_files if available_files else []
	    return gr.CheckboxGroup(value=selection)

	def clear_all_handler():
	    """Return a checkbox-group update with nothing selected."""
	    nothing_selected = []
	    return gr.CheckboxGroup(value=nothing_selected)

	# Wire up events
	load_btn.click(
	fn=load_files_handler,
	inputs=[repo_input],
	outputs=[
	file_selector,
	status_output,
	files_state,
	step1_btn,
	step2_btn,
	],
	show_api=False,
	)

	select_all_btn.click(
	fn=select_all_handler,
	inputs=[files_state],
	outputs=[file_selector],
	show_api=False,
	)

	clear_all_btn.click(
	fn=clear_all_handler, outputs=[file_selector], show_api=False
	)

	step1_btn.click(
	fn=start_step1_generator,
	inputs=[repo_input, file_selector, progress_state],
	outputs=[progress_state, progress_display, step2_btn],
	show_api=False,
	)

	step2_btn.click(
	fn=start_step2,
	inputs=[progress_state],
	outputs=[progress_state, progress_display],
	show_api=False,
	)

	refresh_btn.click(
	fn=refresh_progress,
	inputs=[progress_state],
	outputs=[progress_display],
	show_api=False,
	)

	reset_btn.click(
	fn=reset_progress,
	outputs=[progress_state, progress_display, step2_btn],
	show_api=False,
	)

	# ================================
	# Tab 2: Query Interface
	# ================================
	with gr.TabItem("🤖 AI Documentation Assistant"):
	gr.Markdown("### 💬 Intelligent Documentation Q&A")
	gr.Markdown(
	"Query your processed documentation using advanced semantic search. Get contextual answers with source citations powered by Nebius LLM and vector similarity search."
	)

	with gr.Row():
	with gr.Column(scale=2):
	# Repository selection - Dropdown that becomes textbox when selected
	with gr.Row():
	repo_dropdown = gr.Dropdown(
	choices=get_available_repositories() or ["No repositories available"],
	label="📚 Select Documentation Repository",
	value=None,
	interactive=True,
	allow_custom_value=True,
	info="Choose from available repositories"
	)

	# Hidden textbox that will become visible when repo is selected
	selected_repo_textbox = gr.Textbox(
	label="🎯 Selected Repository",
	value="",
	interactive=False,
	visible=False,
	info="Currently selected repository for querying"
	)

	refresh_repos_btn = gr.Button(
	"🔄 Refresh Repository List", variant="secondary", size="sm"
	)

	# Query mode selection
	query_mode = gr.Radio(
	choices=["default", "text_search", "hybrid"],
	label="🔍 Search Strategy",
	value="default",
	info="• default: Semantic similarity (AI understanding)\n• text_search: Keyword matching\n• hybrid: Combined approach for best results",
	)

	# Query input
	query_input = gr.Textbox(
	label="💭 Ask About Your Documentation",
	placeholder="How do I implement a custom component? What are the available API endpoints? How to configure the system?",
	lines=3,
	info="Ask natural language questions about your documentation"
	)

	query_btn = gr.Button("🚀 Search Documentation", variant="primary", size="lg")

	# Response display as text area
	response_output = gr.Textbox(
	label="🤖 AI Assistant Response",
	value="Your AI-powered documentation response will appear here with contextual information and source citations...",
	lines=10,
	interactive=False,
	info="Generated using Nebius LLM with retrieved documentation context"
	)

	with gr.Column(scale=2):
	gr.Markdown("### 📖 Source References")
	gr.Markdown("View the exact documentation sources used to generate the response, with relevance scores and GitHub links.")

	# Source nodes display as JSON
	sources_output = gr.JSON(
	label="📎 Source Citations & Metadata",
	value={
	"message": "Source documentation excerpts with relevance scores will appear here after your query...",
	"info": "Each source includes file path, relevance score, and content snippet"
	},
	)

	# Event handlers
	def handle_repo_selection(selected_repo):
	    """Swap the dropdown for a read-only textbox once a repository is chosen.

	    Returns updates for (dropdown, selected-repo textbox, query button).
	    """
	    placeholders = ["No repositories available", ""]
	    has_selection = bool(selected_repo) and selected_repo not in placeholders

	    if has_selection:
	        return (
	            gr.Dropdown(visible=False),  # Hide dropdown
	            gr.Textbox(visible=True, value=selected_repo),  # Show chosen repo
	            gr.Button(interactive=True),  # Enable query button
	        )

	    return (
	        gr.Dropdown(visible=True),  # Keep dropdown visible
	        gr.Textbox(visible=False, value=""),  # Hide textbox
	        gr.Button(interactive=False),  # Disable query button
	    )

	def reset_repo_selection():
	    """Show the dropdown again with a freshly fetched repository list.

	    Returns updates for (dropdown, selected-repo textbox, query button); the
	    textbox is hidden and the query button disabled in every case.
	    """
	    try:
	        choices = get_available_repositories() or ["No repositories available"]
	        dropdown = gr.Dropdown(choices=choices, value=None, visible=True)
	    except Exception as e:
	        print(f"Error refreshing repository list: {e}")
	        dropdown = gr.Dropdown(
	            choices=["Error loading repositories"], value=None, visible=True
	        )

	    hidden_textbox = gr.Textbox(visible=False, value="")
	    disabled_query = gr.Button(interactive=False)
	    return dropdown, hidden_textbox, disabled_query

	def get_available_docs_repo():
	    """
	    List the available docs of repositories - should be called first to list out all the available repo docs to chat with

	    Returns:
	        Updated dropdown with available repositories
	    """
	    empty_notice = "No repositories available - Please ingest documentation first"
	    try:
	        choices = get_available_repositories() or [empty_notice]
	        return gr.Dropdown(choices=choices, value=None)
	    except Exception as e:
	        print(f"Error refreshing repository list: {e}")
	        return gr.Dropdown(choices=["Error loading repositories"], value=None)

	# Simple query handler
	def handle_query(repo: str, mode: str, query: str):
	    """
	    Handle query request - returns raw data from retriever

	    Args:
	        repo: Selected repository from textbox
	        mode: Query mode (default, text_search, hybrid)
	        query: User's query

	    Returns:
	        Raw result dict from QueryRetriever.make_query(), or
	        ``{"error": ...}`` when the input is invalid or the query fails.
	    """
	    # A missing (None) query is treated like an empty one instead of raising.
	    if not query or not query.strip():
	        return {"error": "Please enter a query."}

	    # Placeholder entries the repository dropdown may show; none of them
	    # names a real repository.
	    placeholders = (
	        "No repositories available",
	        "No repositories available - Please ingest documentation first",
	        "Error loading repositories",
	        "",
	    )
	    repo = (repo or "").strip()
	    if repo in placeholders:
	        return {"error": "Please select a valid repository."}

	    try:
	        # Create query retriever for the selected repo and return raw result
	        retriever = QueryRetriever(repo)
	        return retriever.make_query(query, mode)

	    except Exception as e:
	        print(f"Query error: {e}")
	        traceback.print_exc()
	        return {"error": f"Query failed: {str(e)}"}

	def make_query(repo: str, mode: str, query: str):
	    """
	    Retrieve relevant documentation context for a given query using specified retrieval mode.

	    This function is designed to support Retrieval-Augmented Generation (RAG) by extracting
	    the most relevant context chunks from indexed documentation sources.

	    Args:
	        repo: Selected repository from the textbox input
	        mode: Query mode (default, text_search, hybrid)
	        query: User's query

	    Returns:
	        Tuple of (response_text, source_nodes_json)
	    """
	    raw = handle_query(repo, mode, query)

	    # Failure: surface the error in both outputs.
	    if "error" in raw:
	        return f"Error: {raw['error']}", {"error": raw["error"]}

	    answer = raw.get("response", "No response available")
	    sources = raw.get("source_nodes", [])
	    return answer, sources

	# Wire up events

	# Handle repository selection from dropdown
	repo_dropdown.change(
	fn=handle_repo_selection,
	inputs=[repo_dropdown],
	outputs=[repo_dropdown, selected_repo_textbox, query_btn],
	show_api=False
	)

	# Handle refresh button - resets to dropdown view
	refresh_repos_btn.click(
	fn=reset_repo_selection,
	outputs=[repo_dropdown, selected_repo_textbox, query_btn],
	show_api=False
	)

	# Also provide API endpoint for listing repositories
	refresh_repos_btn.click(
	fn=get_available_docs_repo,
	outputs=[repo_dropdown],
	api_name="list_available_docs",
	)

	# Query button uses the textbox value (not dropdown)
	query_btn.click(
	fn=make_query,
	inputs=[selected_repo_textbox, query_mode, query_input], # Use textbox, not dropdown
	outputs=[response_output, sources_output],
	api_name="query_documentation",
	)

	# Also allow Enter key to trigger query
	query_input.submit(
	fn=make_query,
	inputs=[selected_repo_textbox, query_mode, query_input], # Use textbox, not dropdown
	outputs=[response_output, sources_output],
	show_api=False,
	)

	# ================================
	# Tab 3: Repository Management
	# ================================
	with gr.TabItem("🗂️ Repository Management"):
	gr.Markdown("Manage your ingested repositories - view details and delete repositories when needed.")

	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### 📊 Repository Statistics")
	stats_display = gr.JSON(
	label="Database Statistics",
	value={"message": "Click refresh to load statistics..."}
	)
	refresh_stats_btn = gr.Button("🔄 Refresh Statistics", variant="secondary")

	with gr.Column(scale=2):
	gr.Markdown("### 📋 Repository Details")
	repos_table = gr.Dataframe(
	headers=["Repository", "Files", "Last Updated"],
	datatype=["str", "number", "str"],
	label="Ingested Repositories",
	interactive=False,
	wrap=True
	)
	refresh_repos_btn = gr.Button("🔄 Refresh Repository List", variant="secondary")

	gr.Markdown("### 🗑️ Delete Repository")
	gr.Markdown("⚠️ Warning: This will permanently delete all documents and metadata for the selected repository.")

	with gr.Row():
	with gr.Column(scale=2):
	delete_repo_dropdown = gr.Dropdown(
	choices=[],
	label="Select Repository to Delete",
	value=None,
	interactive=True,
	allow_custom_value=False,
	)

	# Confirmation checkbox
	confirm_delete = gr.Checkbox(
	label="I understand this action cannot be undone",
	value=False
	)

	delete_btn = gr.Button(
	"🗑️ Delete Repository",
	variant="stop",
	size="lg",
	interactive=False
	)

	with gr.Column(scale=1):
	deletion_status = gr.Textbox(
	label="Deletion Status",
	value="Select a repository and confirm to enable deletion.",
	interactive=False,
	lines=6
	)

	# Management functions
	def load_repository_stats():
	    """Return ``get_repository_stats()``, or an ``{"error": ...}`` dict on failure."""
	    try:
	        return get_repository_stats()
	    except Exception as e:
	        return {"error": f"Failed to load statistics: {str(e)}"}

	def load_repository_details():
	    """Build table rows ``[repository, file count, last updated]``.

	    Returns a single placeholder row when no repositories are reported and a
	    single error row when loading fails.
	    """

	    def _as_text(stamp):
	        # Objects with strftime get a fixed format; the "Unknown" default is
	        # passed through; anything else is stringified.
	        if hasattr(stamp, "strftime"):
	            return stamp.strftime("%Y-%m-%d %H:%M")
	        return stamp if stamp == "Unknown" else str(stamp)

	    try:
	        details = get_repo_details()
	        if not details:
	            return [["No repositories found", 0, "N/A"]]

	        # Format for dataframe
	        return [
	            [
	                entry.get("repo_name", "Unknown"),
	                entry.get("file_count", 0),
	                _as_text(entry.get("last_updated", "Unknown")),
	            ]
	            for entry in details
	        ]

	    except Exception as e:
	        return [["Error loading repositories", 0, str(e)]]

	def update_delete_dropdown():
	    """Rebuild the "Select Repository to Delete" dropdown.

	    Returns:
	        A ``gr.Dropdown`` update whose choices come from
	        ``get_available_repositories()`` and whose selection is cleared.
	        If building it raises, the problem is printed and an update with
	        no choices is returned instead.
	    """
	    try:
	        return gr.Dropdown(choices=get_available_repositories(), value=None)
	    except Exception as exc:
	        print(f"Error updating delete dropdown: {exc}")
	        return gr.Dropdown(choices=[], value=None)

	def check_delete_button_state(repo_selected, confirmation_checked):
	    """Decide whether the delete button may be clicked.

	    Args:
	        repo_selected: Current value of the repository dropdown.
	        confirmation_checked: Current value of the confirmation checkbox.

	    Returns:
	        A ``gr.Button`` update that is interactive only when both inputs
	        are truthy.
	    """
	    deletion_allowed = bool(repo_selected and confirmation_checked)
	    return gr.Button(interactive=deletion_allowed)

	def delete_repository(repo_name: str, confirmed: bool):
	    """Delete the selected repository and report the outcome.

	    Args:
	        repo_name: Repository chosen in the delete dropdown.
	        confirmed: Whether the confirmation checkbox is ticked.

	    Returns:
	        A 3-tuple matching the wired outputs: the status text, an update
	        for the repository dropdown, and an update that unticks the
	        confirmation checkbox.
	    """
	    # Validation failures touch no data, so the dropdown is left exactly as
	    # it is (gr.update() with no arguments changes nothing). Replacing it
	    # with an empty choice list would force the user to refresh manually.
	    if not repo_name:
	        return "❌ No repository selected.", gr.update(), gr.Checkbox(value=False)

	    if not confirmed:
	        return (
	            "❌ Please confirm deletion by checking the checkbox.",
	            gr.update(),
	            gr.Checkbox(value=False),
	        )

	    try:
	        # Perform deletion
	        result = delete_repository_data(repo_name)

	        # Prepare status message
	        status_msg = result["message"]
	        if result["success"]:
	            status_msg += "\n\n📊 Deletion Summary:"
	            status_msg += f"\n- Vector documents removed: {result['vector_docs_deleted']}"
	            status_msg += f"\n- Repository record deleted: {'Yes' if result['repo_record_deleted'] else 'No'}"
	            status_msg += f"\n\n✅ Repository '{repo_name}' has been completely removed."

	        # Re-read the repository list (drops the deleted entry) and reset
	        # the confirmation checkbox for the next deletion.
	        return status_msg, update_delete_dropdown(), gr.Checkbox(value=False)

	    except Exception as e:
	        error_msg = f"❌ Error deleting repository: {str(e)}"
	        # The deletion may have partially completed before failing, so
	        # re-read the list instead of blanking the dropdown.
	        return error_msg, update_delete_dropdown(), gr.Checkbox(value=False)

	# Wire up management events.
	# Refresh buttons: statistics panel, repository table, and (second handler
	# on the same button) the delete dropdown.
	for _mgmt_button, _mgmt_handler, _mgmt_target in (
	    (refresh_stats_btn, load_repository_stats, stats_display),
	    (refresh_repos_btn, load_repository_details, repos_table),
	    (refresh_repos_btn, update_delete_dropdown, delete_repo_dropdown),
	):
	    _mgmt_button.click(fn=_mgmt_handler, outputs=[_mgmt_target], show_api=False)

	# Re-evaluate the delete button whenever the selection or the confirmation
	# checkbox changes.
	for _mgmt_watched in (delete_repo_dropdown, confirm_delete):
	    _mgmt_watched.change(
	        fn=check_delete_button_state,
	        inputs=[delete_repo_dropdown, confirm_delete],
	        outputs=[delete_btn],
	        show_api=False,
	    )

	# Delete repository: report status, refresh the dropdown, reset the checkbox.
	delete_btn.click(
	    fn=delete_repository,
	    inputs=[delete_repo_dropdown, confirm_delete],
	    outputs=[deletion_status, delete_repo_dropdown, confirm_delete],
	    show_api=False,
	)

	# Populate all three management widgets when the app loads.
	for _mgmt_handler, _mgmt_target in (
	    (load_repository_stats, stats_display),
	    (load_repository_details, repos_table),
	    (update_delete_dropdown, delete_repo_dropdown),
	):
	    demo.load(fn=_mgmt_handler, outputs=[_mgmt_target], show_api=False)

	# ================================
	# Tab 4: GitHub File Search (Hidden API)
	# ================================
	# The tab is created with visible=False; its three buttons are wired below
	# with explicit api_name values, so the handlers remain callable as named
	# API endpoints even though the tab itself is not shown.
	with gr.TabItem("🔍 GitHub File Search", visible=False):
	gr.Markdown("### 🔧 GitHub Repository File Search API")
	gr.Markdown("Pure API endpoints for GitHub file operations - all responses in JSON format")

	# First row: "list files" inputs on the left, "single file" inputs on the right.
	with gr.Row():
	with gr.Column():
	gr.Markdown("#### 📋 List Repository Files")

	# Repository input for file operations
	api_repo_input = gr.Textbox(
	label="Repository URL",
	placeholder="owner/repo or https://github.com/owner/repo",
	value="",
	info="GitHub repository to scan"
	)

	# Branch selection
	api_branch_input = gr.Textbox(
	label="Branch",
	value="main",
	placeholder="main",
	info="Branch to search (default: main)"
	)

	# File extensions
	api_extensions_input = gr.Textbox(
	label="File Extensions (comma-separated)",
	value=".md,.mdx",
	placeholder=".md,.mdx,.txt",
	info="File extensions to include"
	)

	# List files button
	list_files_btn = gr.Button("📋 List Files", variant="primary")

	with gr.Column():
	gr.Markdown("#### 📄 Get Single File")

	# Single file inputs
	single_repo_input = gr.Textbox(
	label="Repository URL",
	placeholder="owner/repo or https://github.com/owner/repo",
	value="",
	info="GitHub repository"
	)

	single_file_input = gr.Textbox(
	label="File Path",
	placeholder="docs/README.md",
	value="",
	info="Path to specific file in repository"
	)

	single_branch_input = gr.Textbox(
	label="Branch",
	value="main",
	placeholder="main",
	info="Branch name (default: main)"
	)

	# Get single file button
	get_single_btn = gr.Button("📄 Get Single File", variant="secondary")

	# Second row: inputs for fetching several files in one call.
	with gr.Row():
	with gr.Column():
	gr.Markdown("#### 📚 Get Multiple Files")

	# Multiple files inputs
	multiple_repo_input = gr.Textbox(
	label="Repository URL",
	placeholder="owner/repo or https://github.com/owner/repo",
	value="",
	info="GitHub repository"
	)

	# Parsed by get_multiple_files(), which splits on commas and drops blanks.
	multiple_files_input = gr.Textbox(
	label="File Paths (comma-separated)",
	placeholder="README.md,docs/guide.md,api/overview.md",
	value="",
	lines=3,
	info="Comma-separated list of file paths"
	)

	multiple_branch_input = gr.Textbox(
	label="Branch",
	value="main",
	placeholder="main",
	info="Branch name (default: main)"
	)

	# Get multiple files button
	get_multiple_btn = gr.Button("📚 Get Multiple Files", variant="secondary")

	# Single JSON output for all operations
	# (all three buttons write their result dict into this one component)
	gr.Markdown("### 📊 API Response")
	api_response_output = gr.JSON(
	label="JSON Response",
	value={
	"message": "API responses will appear here",
	"info": "Use the buttons above to interact with GitHub repositories"
	}
	)

	# Pure API Functions (JSON only responses)
	def list_repository_files(repo_url: str, branch: str = "main", extensions: str = ".md,.mdx"):
	    """
	    List all files in a GitHub repository with specified extensions

	    Args:
	        repo_url: GitHub repository URL or owner/repo format
	        branch: Branch name to search (default: main; also used when blank)
	        extensions: Comma-separated file extensions (default: .md,.mdx)

	    Returns:
	        JSON response with file list and metadata
	    """
	    # Normalise before anything else: a cleared textbox sends "" and API
	    # callers may send None or padded values. A blank branch falls back to
	    # the documented default instead of being forwarded as an empty ref.
	    repo_url = (repo_url or "").strip()
	    branch = (branch or "").strip() or "main"

	    try:
	        if not repo_url:
	            return {"success": False, "error": "Repository URL is required"}

	        # Parse extensions list; blank input falls back to the defaults
	        ext_list = [ext.strip() for ext in (extensions or "").split(",") if ext.strip()]
	        if not ext_list:
	            ext_list = [".md", ".mdx"]

	        # Get files list
	        files, status_message = fetch_repository_files(
	            repo_url=repo_url,
	            file_extensions=ext_list,
	            github_token=os.getenv("GITHUB_API_KEY"),
	            branch=branch,
	        )

	        if files:
	            return {
	                "success": True,
	                "repository": repo_url,
	                "branch": branch,
	                "extensions": ext_list,
	                "total_files": len(files),
	                "files": files,
	                "status": status_message,
	            }

	        # Nothing matched, or the fetch reported a problem via its status text
	        return {
	            "success": False,
	            "repository": repo_url,
	            "branch": branch,
	            "extensions": ext_list,
	            "total_files": 0,
	            "files": [],
	            "error": status_message or "No files found",
	        }

	    except Exception as e:
	        # API boundary: always answer with JSON rather than propagating
	        return {
	            "success": False,
	            "error": f"Failed to list files: {str(e)}",
	            "repository": repo_url,
	            "branch": branch,
	        }

	def get_single_file(repo_url: str, file_path: str, branch: str = "main"):
	    """
	    Retrieve a single file from GitHub repository

	    Args:
	        repo_url: GitHub repository URL or owner/repo format
	        file_path: Path to the file in the repository
	        branch: Branch name (default: main; also used when blank)

	    Returns:
	        JSON response with file content and metadata
	    """
	    # Normalise before anything else: a cleared textbox sends "" and API
	    # callers may send None or padded values.
	    repo_url = (repo_url or "").strip()
	    file_path = (file_path or "").strip()
	    branch = (branch or "").strip() or "main"
	    # Echoed by the catch-all handler; becomes "owner/repo" once parsed.
	    repo_name = repo_url

	    try:
	        if not repo_url:
	            return {"success": False, "error": "Repository URL is required"}

	        if not file_path:
	            return {"success": False, "error": "File path is required"}

	        # Parse repo name: keep what follows the host (if any) and reduce it
	        # to "owner/repo". This covers http/https, "www.", scheme-less and
	        # git@ forms, a ".git" suffix and trailing paths such as "/tree/main".
	        remainder = repo_url.split("github.com", 1)[-1].strip(":/")
	        owner_repo = [part for part in remainder.split("/") if part][:2]
	        if len(owner_repo) == 2 and owner_repo[1].endswith(".git"):
	            owner_repo[1] = owner_repo[1][:-4]
	        if len(owner_repo) < 2 or not owner_repo[1]:
	            return {
	                "success": False,
	                "repository": repo_url,
	                "branch": branch,
	                "file_path": file_path,
	                "error": "Invalid repository format. Expected 'owner/repo' or a GitHub repository URL",
	            }
	        repo_name = "/".join(owner_repo)

	        # Load single file
	        documents, failed = load_github_files(
	            repo_name=repo_name,
	            file_paths=[file_path],
	            branch=branch,
	            github_token=os.getenv("GITHUB_API_KEY"),
	        )

	        if documents:
	            doc = documents[0]
	            return {
	                "success": True,
	                "repository": repo_name,
	                "branch": branch,
	                "file_path": file_path,
	                "file_name": doc.metadata.get("file_name", ""),
	                "file_size": len(doc.text),
	                "content": doc.text,
	                "metadata": doc.metadata,
	                "url": doc.metadata.get("url", ""),
	                "raw_url": doc.metadata.get("raw_url", ""),
	            }

	        # Prefer the loader's own failure entry when it supplied one
	        reason = failed[0] if failed else "File not found or access denied"
	        return {
	            "success": False,
	            "repository": repo_name,
	            "branch": branch,
	            "file_path": file_path,
	            "error": f"Failed to retrieve file: {reason}",
	        }

	    except Exception as e:
	        # API boundary: always answer with JSON rather than propagating
	        return {
	            "success": False,
	            "error": f"Failed to get single file: {str(e)}",
	            "repository": repo_name,
	            "file_path": file_path,
	            "branch": branch,
	        }

	def get_multiple_files(repo_url: str, file_paths_str: str, branch: str = "main"):
	    """
	    Retrieve multiple files from GitHub repository

	    Args:
	        repo_url: GitHub repository URL or owner/repo format
	        file_paths_str: Comma-separated string of file paths
	        branch: Branch name (default: main; also used when blank)

	    Returns:
	        JSON response with multiple file contents and metadata
	    """
	    # Normalise before anything else: a cleared textbox sends "" and API
	    # callers may send None or padded values.
	    repo_url = (repo_url or "").strip()
	    file_paths_str = file_paths_str or ""
	    branch = (branch or "").strip() or "main"
	    # Echoed by the catch-all handler; becomes "owner/repo" once parsed.
	    repo_name = repo_url

	    try:
	        if not repo_url:
	            return {"success": False, "error": "Repository URL is required"}

	        if not file_paths_str.strip():
	            return {"success": False, "error": "File paths are required"}

	        # Parse file paths from comma-separated string, dropping blanks
	        file_paths = [path.strip() for path in file_paths_str.split(",") if path.strip()]

	        if not file_paths:
	            return {"success": False, "error": "No valid file paths provided"}

	        # Parse repo name: keep what follows the host (if any) and reduce it
	        # to "owner/repo". This covers http/https, "www.", scheme-less and
	        # git@ forms, a ".git" suffix and trailing paths such as "/tree/main".
	        remainder = repo_url.split("github.com", 1)[-1].strip(":/")
	        owner_repo = [part for part in remainder.split("/") if part][:2]
	        if len(owner_repo) == 2 and owner_repo[1].endswith(".git"):
	            owner_repo[1] = owner_repo[1][:-4]
	        if len(owner_repo) < 2 or not owner_repo[1]:
	            return {
	                "success": False,
	                "repository": repo_url,
	                "branch": branch,
	                "error": "Invalid repository format. Expected 'owner/repo' or a GitHub repository URL",
	            }
	        repo_name = "/".join(owner_repo)

	        # Load multiple files
	        documents, failed = load_github_files(
	            repo_name=repo_name,
	            file_paths=file_paths,
	            branch=branch,
	            github_token=os.getenv("GITHUB_API_KEY"),
	        )

	        # Process successful documents
	        successful_files = [
	            {
	                "file_path": doc.metadata.get("file_path", ""),
	                "file_name": doc.metadata.get("file_name", ""),
	                "file_size": len(doc.text),
	                "content": doc.text,
	                "metadata": doc.metadata,
	                "url": doc.metadata.get("url", ""),
	                "raw_url": doc.metadata.get("raw_url", ""),
	            }
	            for doc in documents
	        ]

	        # "success" reports that the request was processed; per-file
	        # outcomes are carried by the successful/failed counters below.
	        return {
	            "success": True,
	            "repository": repo_name,
	            "branch": branch,
	            "requested_files": len(file_paths),
	            "successful_files": len(successful_files),
	            "failed_files": len(failed),
	            "files": successful_files,
	            "failed_file_paths": failed,
	            "total_content_size": sum(len(doc.text) for doc in documents),
	            "requested_file_paths": file_paths,
	        }

	    except Exception as e:
	        # API boundary: always answer with JSON rather than propagating
	        return {
	            "success": False,
	            "error": f"Failed to get multiple files: {str(e)}",
	            "repository": repo_name,
	            "file_paths": file_paths_str,
	            "branch": branch,
	        }

	# Wire up the GitHub file search events - all output to single JSON component.
	# Each entry: (button, handler, input components, public API endpoint name).
	for _api_button, _api_handler, _api_inputs, _api_endpoint in (
	    (
	        list_files_btn,
	        list_repository_files,
	        [api_repo_input, api_branch_input, api_extensions_input],
	        "list_repository_files",
	    ),
	    (
	        get_single_btn,
	        get_single_file,
	        [single_repo_input, single_file_input, single_branch_input],
	        "get_single_file",
	    ),
	    (
	        get_multiple_btn,
	        get_multiple_files,
	        [multiple_repo_input, multiple_files_input, multiple_branch_input],
	        "get_multiple_files",
	    ),
	):
	    _api_button.click(
	        fn=_api_handler,
	        inputs=_api_inputs,
	        outputs=[api_response_output],
	        api_name=_api_endpoint,
	    )
	# Script entry point: start the Gradio app with its MCP server enabled.
	if __name__ == "__main__":
	demo.launch(mcp_server=True)