Spaces:
Sleeping
Sleeping
File size: 3,934 Bytes
fe36699 924bf1b cf10f44 61157d2 2270bc7 42ada2f 92757b3 9212ca3 fe36699 61157d2 a5bd9c0 fe36699 10a8f5e 2270bc7 a06984f 2270bc7 cf10f44 20218cb cf10f44 fe36699 cf10f44 20218cb cf10f44 2270bc7 cf10f44 fe36699 cf10f44 61157d2 20218cb 61157d2 fe36699 cf10f44 20218cb cf10f44 61157d2 cf10f44 61157d2 2270bc7 61157d2 2270bc7 fe36699 61157d2 cf10f44 fe36699 d968fd4 49d0de6 d968fd4 5e8d963 d968fd4 61157d2 fe36699 5b4c39c fe36699 cf10f44 61157d2 fe36699 cf10f44 5f6bee3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import gradio as gr
import os
import pdfplumber
import together
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import huggingface_hub as login
import re
import unicodedata
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# The Together.AI API key is read from the environment (a .env entry or a
# deployment secret). Fail fast with an explicit exception instead of
# `assert`, which is stripped when Python runs with -O.
if not os.getenv("TOGETHER_API_KEY"):
    raise RuntimeError("api key missing")

# Retrieve the Hugging Face API token from secrets
api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if api_token:
    # The file-level `import huggingface_hub as login` binds the *module* to
    # the name `login`, so calling it raises TypeError. Import the actual
    # `login` function here so authentication really happens.
    from huggingface_hub import login

    login(api_token)  # Authenticate with Hugging Face

# Load LLaMA-2 Model
# NOTE(review): `llama_pipe` is not referenced anywhere else in the visible
# file, yet loading it is expensive -- confirm whether it is still needed.
llama_pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf")

# Load Sentence Transformer for embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize FAISS index
embedding_dim = 384  # For MiniLM model
index = faiss.IndexFlatL2(embedding_dim)

# Raw document texts; position i corresponds to vector i added to `index`.
documents = []
def store_document(text):
    """Embed ``text`` and register it in the vector index and document list.

    The text is encoded with the module-level ``embedding_model``, the
    resulting vector is added to the module-level FAISS ``index``, and the raw
    text is appended to ``documents`` so it can be looked up by position.

    Args:
        text: The document text to store.

    Returns:
        The fixed status string ``"Document stored!"``.
    """
    print("storing document")
    encoded = embedding_model.encode([text])
    vectors = np.array(encoded, dtype=np.float32)  # FAISS works on float32
    index.add(vectors)
    documents.append(text)
    print(f"your document has been stored: \n{documents}")
    return "Document stored!"
def retrieve_document(query):
    """Return the stored document whose embedding is closest to ``query``.

    The query is encoded with the module-level ``embedding_model`` and the
    single nearest neighbour is looked up in the module-level FAISS ``index``.

    Args:
        query: The user's question or search text.

    Returns:
        The raw text of the best-matching entry in ``documents``.

    Raises:
        IndexError: If the search yields no usable position (for example
            because nothing has been stored yet).
    """
    print(f"retrieving doc based on {query}")
    query_vector = np.array(embedding_model.encode([query]), dtype=np.float32)
    _, neighbours = index.search(query_vector, 1)
    best = int(neighbours[0][0])
    # FAISS reports -1 when it has no neighbour to return. Used as a list
    # index, -1 would silently select the *last* document (or fail with an
    # unhelpful message on an empty list), so reject it explicitly.
    if best < 0 or best >= len(documents):
        raise IndexError("No stored document matches the query; store a document first.")
    match = documents[best]
    print(f"retrieved: {match}")
    return match
def clean_text(text):
    """Clean extracted text for better processing by the model.

    Steps, in order:
      1. Normalize Unicode with NFKC (e.g. ligatures become plain letters).
      2. Replace every character that is not an ASCII letter, digit or basic
         punctuation mark with a space.
      3. Remove page markers such as ``Page 12`` (case-insensitive).
      4. Collapse runs of whitespace to a single space and trim the ends.

    Whitespace is collapsed last because steps 2 and 3 themselves introduce
    spaces; collapsing earlier leaves runs of blanks in the result.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The cleaned, single-line text.
    """
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\"()\-]', ' ', text)  # Keep basic punctuation
    # \b keeps words that merely end in "page" (e.g. "homepage 3") intact.
    text = re.sub(r'(?i)\bpage\s*\d+', '', text)  # Remove page numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and newlines
    return text
def extract_text_from_pdf(pdf_file):
    """Extract and clean text from the uploaded PDF, then index it.

    Every page that yields text is cleaned with ``clean_text`` and the pages
    are joined with single spaces. Non-empty text is passed to
    ``store_document`` unless an identical text is already in ``documents``,
    so asking several questions about the same PDF does not keep adding
    duplicate entries to the index.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts for the uploaded file.

    Returns:
        The cleaned text (possibly an empty string when no page has text),
        or ``None`` if anything went wrong while reading the PDF.
    """
    print("extracting")
    try:
        page_texts = []
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                raw = page.extract_text()
                if raw:  # pages without extractable text are skipped
                    page_texts.append(clean_text(raw))
        text = " ".join(page_texts)
        # Do not index empty documents or re-index one that is already stored.
        if text and text not in documents:
            store_document(text)
        return text
    except Exception as e:
        # Best effort: report the problem and let the caller handle ``None``.
        print(f"Error extracting text: {e}")
        return None
def split_text(text, chunk_size=500):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece (the last one may be shorter).

    Returns:
        A list of substrings in original order; empty when ``text`` is empty.
    """
    print("splitting")
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces
def chatbot(pdf_file, user_question):
    """Process the PDF and answer the user's question.

    The PDF text is extracted (and indexed), the stored document closest to
    the question is retrieved, and its first chunk is sent together with the
    question to Together.AI for completion.

    Args:
        pdf_file: The uploaded PDF as delivered by the Gradio file input.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message when the question is
        empty, no text could be extracted, or the completion request failed.
    """
    print("chatbot start")

    # Nothing sensible can be retrieved or asked without a question.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_file)
    if not text:
        return "Could not extract any text from the PDF."

    # Retrieve the document relevant to the query
    doc = retrieve_document(user_question)

    # Split into smaller chunks
    chunks = split_text(doc)
    if not chunks:
        # An empty retrieved document produces no chunks; avoid chunks[0].
        return "Could not extract any text from the PDF."

    # Use only the first chunk (to optimize token usage)
    prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"

    # Send to Together.AI (Mistral-7B)
    try:
        response = together.Completion.create(
            model="mistralai/Mistral-7B-Instruct-v0.1",
            prompt=prompt,
            max_tokens=200,
            temperature=0.7,
        )
        # Return chatbot's response
        return response.choices[0].text
    except Exception as e:
        # Surface the failure in the UI instead of crashing the request.
        return f"Error generating response: {e}"
# Gradio Interface: a PDF upload and a question box in, one answer box out.
pdf_input = gr.File(label="Upload PDF")
question_input = gr.Textbox(label="Ask a Question")
answer_output = gr.Textbox(label="Answer")

iface = gr.Interface(
    fn=chatbot,
    inputs=[pdf_input, question_input],
    outputs=answer_output,
    title="PDF Q&A Chatbot (Powered by Together.AI)",
)

# Launch Gradio app
iface.launch()
|