Spaces:

Penality
/

pdf-something

Sleeping

File size: 5,905 Bytes

fe36699
6dee028
924bf1b
cf10f44
61157d2
2270bc7
 
 
92757b3
 
9212ca3
 
 
fe36699
61157d2
a5bd9c0
fe36699
84b4386
4468b37
 
 
 
 
 
 
 
84b4386
2270bc7
 
 
 
 
c8d88b2
5dc8c5b
c8d88b2
 
7d978f8
c8d88b2
58ffb78
7d978f8
4ce21a8
 
 
 
c8d88b2
 
adec520
c8d88b2
adec520
 
 
7d978f8
c8d88b2
 
adec520
7d978f8
 
 
 
 
2270bc7
c8d88b2
7d978f8
 
c8d88b2
 
 
7d978f8
c8d88b2
 
7d978f8
c8d88b2
6dee028
c8d88b2
7d978f8
c8d88b2
 
7d978f8
c8d88b2
 
 
 
 
7d978f8
adec520
afa0a99
adec520
7d978f8
c8d88b2
 
 
 
2270bc7
 
 
9aee54a
fd0dd62
7d978f8
 
fd0dd62
c8d88b2
adec520
 
c8d88b2
 
7d978f8
 
 
 
 
 
2270bc7
 
cf10f44
 
20218cb
cf10f44
 
 
 
 
fe36699
cf10f44
 
20218cb
cf10f44
 
 
2270bc7
cf10f44
 
 
 
fe36699
cf10f44
61157d2
20218cb
61157d2
fe36699
cf10f44
 
20218cb
2270bc7
2a7ef32
 
 
 
 
fe36699
fd0dd62
 
 
2a7ef32
adec520
2a7ef32
 
 
 
 
9aee54a
2a7ef32
 
fe36699
d968fd4
9aee54a
49d0de6
d968fd4
 
 
 
 
 
 
5e8d963
d968fd4
 
 
61157d2
fe36699
5b4c39c
fe36699
cf10f44
 
 
 
 
61157d2
fe36699
 
cf10f44
7d978f8

import gradio as gr
import json
import os
import pdfplumber
import together
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import unicodedata
from dotenv import load_dotenv

load_dotenv()

# The Together.AI key must be provided through the environment (or a .env file
# picked up by load_dotenv above). Raise explicitly instead of using `assert`,
# because assert statements are stripped when Python runs with -O.
if not os.getenv("TOGETHER_API_KEY"):
    raise RuntimeError("api key missing")

# Sentence-transformer model used to embed both documents and queries.
# Alternative model that was tried previously: "BAAI/bge-base-en-v1.5".
embedding_model = SentenceTransformer(
    "togethercomputer/m2-bert-80M-8k-retrieval",
    trust_remote_code=True,  # This model ships custom code that must be allowed to run
)
embedding_dim = 768  # Must match the output size of the embedding model above

documents = []  # Store raw text for reference

# Storage locations.
# NOTE(review): DOCUMENT_DIR is anchored to this file's directory, while the
# index and metadata files are resolved against the current working
# directory -- confirm the app is always launched from the same directory.
DOCUMENT_DIR = os.path.join(os.path.dirname(__file__), "documents")
INDEX_FILE = "faiss_index.bin"  # FAISS index file (binary format)
METADATA_FILE = "metadata.json"  # Maps FAISS position (as a string) -> document path

# Create the documents directory if it doesn't exist
os.makedirs(DOCUMENT_DIR, exist_ok=True)


print(os.getcwd())  # This will print the current working directory
print(os.listdir("."))  # This will show files in the current directory

# Load the persisted FAISS index if there is one, otherwise start empty.
if os.path.exists(INDEX_FILE):
    print(" FAISS index file exists")
    index = faiss.read_index(INDEX_FILE)
else:
    print(" No FAISS index found. Creating a new one.")
    index = faiss.IndexFlatL2(embedding_dim)  # Empty FAISS index

# Load the persisted metadata if there is any, otherwise start empty.
if os.path.exists(METADATA_FILE):
    print("metadata exists")
    with open(METADATA_FILE, "r", encoding="utf-8") as f:
        metadata = json.load(f)
else:
    metadata = {}

def store_document(text):
    """Persist ``text`` as a new document and register it in the FAISS index.

    The text is written to a numbered ``doc_<n>.txt`` file under
    ``DOCUMENT_DIR``, embedded with ``embedding_model`` and appended to the
    module-level ``index``. The position of the new vector (as a string) is
    mapped to the file path in ``metadata``; both the metadata and the index
    are then written back to disk.

    Returns:
        The fixed status string ``"Document stored!"``.
    """
    print(" Storing document...")

    # File number is derived from how many documents are already registered.
    next_number = len(metadata) + 1
    target_path = os.path.join(DOCUMENT_DIR, f"doc_{next_number}.txt")
    print(f"Saving document at: {target_path}")

    with open(target_path, "w", encoding="utf-8") as doc_file:
        doc_file.write(text)
    print(" Document saved")

    # Embed the whole text as a single vector and append it to the index.
    vector = embedding_model.encode([text])
    index.add(vector.astype(np.float32))
    print(" Embeddings generated")

    # The vector just added sits at the last position of the index.
    position_key = str(index.ntotal - 1)
    metadata[position_key] = target_path
    with open(METADATA_FILE, "w") as meta_file:
        print(metadata)
        json.dump(metadata, meta_file)
    print("saved Metadata")

    # Write the updated index to disk.
    faiss.write_index(index, INDEX_FILE)

    print(f" Document stored successfully at: {target_path}")
    return "Document stored!"

def retrieve_document(query):
    """Return the text of the stored document closest to ``query``.

    The query is embedded with ``embedding_model`` and the single nearest
    vector is looked up in the module-level FAISS ``index``. The resulting
    position is translated to a file path through ``metadata``, whose keys
    are positions stored as strings.

    Args:
        query: The user's question.

    Returns:
        The document text, or ``None`` when the query is empty/blank, the
        index has no match, or the match has no metadata entry.
    """
    print(f"retrieving doc based on: \n{query}")

    # An empty or blank query has no meaningful nearest neighbour.
    if not query or not query.strip():
        print("No relevant Document found")
        return None

    query_embedding = embedding_model.encode([query]).astype(np.float32)
    _, closest_idx = index.search(query_embedding, 1)

    # The search result is a NumPy array; convert the single hit to a plain
    # int so that position 0 is not mistaken for "no result" and so the
    # lookup key matches the string keys used in metadata. FAISS reports a
    # missing neighbour (e.g. an empty index) as -1.
    best_position = int(closest_idx[0][0])
    key = str(best_position)
    if best_position < 0 or key not in metadata:
        print("No relevant Document found")
        return None

    # Documents are written as UTF-8, so read them back the same way.
    with open(metadata[key], "r", encoding="utf-8") as f:
        return f.read()


def clean_text(text):
    """Clean extracted text for better processing by the model.

    Steps, in order:
      1. Unicode NFKC normalisation (e.g. ligatures become plain letters).
      2. Every character other than ASCII letters, digits and basic
         punctuation is replaced by a space.
      3. "page <number>" markers are removed (case-insensitive).
      4. Runs of whitespace are collapsed to one space and the ends trimmed.

    Whitespace is normalised last because steps 2 and 3 can themselves
    introduce consecutive or trailing spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\"()\-]', ' ', text)  # Keep basic punctuation
    text = re.sub(r'(?i)(page\s*\d+)', '', text)  # Remove page numbers
    return re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and newlines

def extract_text_from_pdf(pdf_file):
    """Extract, clean and store the text of an uploaded PDF.

    Each page's text is extracted with pdfplumber and passed through
    ``clean_text``; the non-empty results are joined with single spaces. The
    combined text is handed to ``store_document`` only when there is something
    to store, so a PDF without extractable text does not add an empty
    document to the index.

    Args:
        pdf_file: The PDF to read, passed straight to ``pdfplumber.open``.

    Returns:
        The cleaned text (an empty string when no text was found), or
        ``None`` if extraction or storage raised an exception.
    """
    print("extracting")
    try:
        cleaned_pages = []
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                raw_text = page.extract_text()
                if not raw_text:
                    continue  # Page has no extractable text
                cleaned = clean_text(raw_text)
                if cleaned.strip():
                    cleaned_pages.append(cleaned)

        text = " ".join(cleaned_pages)
        if text:
            store_document(text)
        else:
            print("No text found in PDF; nothing stored")
        return text
    except Exception as e:
        # Best-effort: report the problem and let the caller handle None.
        print(f"Error extracting text: {e}")
        return None

def split_text(text, chunk_size=500):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The final piece may be shorter than ``chunk_size``; an empty ``text``
    yields an empty list.
    """
    print("splitting")
    pieces = []
    for begin in range(0, len(text), chunk_size):
        end = begin + chunk_size
        pieces.append(text[begin:end])
    return pieces

def chatbot(pdf_file, user_question):
    """Process an optional PDF upload and answer the user's question.

    If a PDF is supplied, its text is extracted (and stored) first. The
    question is then used to retrieve the closest stored document; when one
    is found, its first chunk is included in the prompt as context, otherwise
    the question alone is sent to the model.

    Args:
        pdf_file: Uploaded PDF, or a falsy value when nothing was uploaded.
        user_question: The question typed by the user.

    Returns:
        The model's answer, or a human-readable message when the PDF yields
        no text, the question is empty, or the completion request fails.
    """
    print("chatbot start")

    if pdf_file:
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_file)
        if not text:
            return "Could not extract any text from the PDF."

    # Checked after the PDF step so an upload is still ingested even when the
    # question box was left empty.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    # Retrieve the document relevant to the query
    doc = retrieve_document(user_question)

    if doc:
        print(f"found doc{doc}")
        # Split into smaller chunks
        chunks = split_text(doc)

        # Use only the first chunk (to optimize token usage)
        prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
        print(f"prompt: \n{prompt}")
    else:
        prompt = user_question

    # Send to Together.AI (Mistral-7B)
    try:
        print("asking")
        response = together.Completion.create(
            model="mistralai/Mistral-7B-Instruct-v0.1",
            prompt=prompt,
            max_tokens=200,
            temperature=0.7,
        )

        # Return chatbot's response
        return response.choices[0].text
    except Exception as e:
        return f"Error generating response: {e}"



# Gradio UI: a PDF upload and a question box as inputs, one answer box as
# output, all wired to the chatbot() handler.
iface = gr.Interface(
    title="PDF Q&A Chatbot (Powered by Together.AI)",
    fn=chatbot,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs=gr.Textbox(label="Answer"),
)

# Start the Gradio app
iface.launch()