import streamlit as st
import pinecone
import os  # To access environment variables
from sentence_transformers import SentenceTransformer
import numpy as np
from datasets import load_dataset
# Step 1: Get the Pinecone API key from the environment variable (Hugging Face secret)
pinecone_api_key = os.getenv('PINECONE_API_KEY')  # Fetch Pinecone API key from Hugging Face secrets
if not pinecone_api_key:
    st.error("Pinecone API key not found! Make sure to set the secret in Hugging Face settings.")
    st.stop()
# Initialize the Pinecone client using the API key
pinecone.init(api_key=pinecone_api_key, environment="us-west1-gcp")  # Change the environment if needed

# Connect to the Pinecone index
index_name = "legal-docs-index-dji2ip8"  # Your Pinecone index name
index = pinecone.Index(index_name)
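# Note: the init()/environment style above matches the pre-3.0 "pinecone-client"
# package; newer releases of the Pinecone SDK replace it with a Pinecone(api_key=...)
# client object, so pin the dependency accordingly.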
# Step 2: Load the sentence-transformers model for embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
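# Streamlit reruns this whole script on every interaction, so the model (and the
# dataset below) are reloaded each time. One way around that is a cached loader
# like the sketch below, using Streamlit's st.cache_resource; it is shown here
# for illustration only and is not wired into the app.
@st.cache_resource
def load_embedding_model():
    # Cached so the model is created once per session instead of on every rerun.
    return SentenceTransformer("all-MiniLM-L6-v2")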
# Step 3: Load the US-LegalKit dataset (kept for reference; retrieval below goes through Pinecone)
dataset = load_dataset("macadeliccc/US-LegalKit", split="train")
law_texts = [item['text'] for item in dataset if 'text' in item]
# Step 4: Function to search the Pinecone index
def search_pinecone(query, top_k=5):
    # Create an embedding for the user's query (a flat list of floats for Pinecone)
    query_embedding = model.encode(query).tolist()
    # Query the Pinecone index for the most similar documents
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    # Return the text stored in the metadata of each match
    return [match['metadata']['text'] for match in results['matches']]
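# The query above assumes each vector in the index was stored with a 'text'
# metadata field. A one-time setup along the lines of the sketch below could have
# populated the index from law_texts; this function is illustrative only and is
# never called by the app.
def populate_index_sketch(texts, batch_size=100):
    # Embed the documents in batches and upsert them with the original text as
    # metadata, so queries can return readable passages.
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        embeddings = model.encode(batch)
        vectors = [
            (f"doc-{start + i}", embedding.tolist(), {"text": text})
            for i, (embedding, text) in enumerate(zip(embeddings, batch))
        ]
        index.upsert(vectors=vectors)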
# Step 5: Streamlit UI
st.title("Legal AI Assistant (US-LegalKit)")

query = st.text_input("Enter your legal query:")

if query:
    # Get the top results from Pinecone
    results = search_pinecone(query)
    st.write("### Relevant Legal Documents:")
    for i, doc in enumerate(results, 1):
        st.write(f"**{i}.** {doc[:500]}...")  # Show a preview of the document
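# To run this app outside the Space (assuming a requirements.txt with streamlit,
# pinecone-client<3.0, sentence-transformers, datasets, and numpy, and with the
# PINECONE_API_KEY environment variable set): streamlit run app.py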