Spaces:

Penality
/

pdf-something

Sleeping

File size: 6,072 Bytes

fe36699
6dee028
924bf1b
cf10f44
61157d2
2270bc7
 
 
92757b3
 
9212ca3
 
 
fe36699
61157d2
a5bd9c0
fe36699
84b4386
4468b37
 
 
 
 
 
 
 
1ed3cce
 
 
 
 
 
 
 
 
84b4386
2270bc7
 
 
 
1ed3cce
 
 
4ce21a8
c8d88b2
 
adec520
c8d88b2
adec520
 
 
7d978f8
c8d88b2
 
1ed3cce
7d978f8
 
 
 
 
2270bc7
c8d88b2
7d978f8
1ed3cce
c8d88b2
1ed3cce
c8d88b2
7d978f8
c8d88b2
 
7d978f8
c8d88b2
6dee028
c8d88b2
7d978f8
c8d88b2
 
7d978f8
c8d88b2
 
 
 
 
7d978f8
afa0a99
1ed3cce
7d978f8
1ed3cce
c8d88b2
1ed3cce
c8d88b2
1ed3cce
2270bc7
a6dfbcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2270bc7
cf10f44
 
20218cb
cf10f44
 
 
 
 
fe36699
cf10f44
 
20218cb
cf10f44
 
 
2270bc7
cf10f44
 
 
 
fe36699
cf10f44
61157d2
20218cb
61157d2
fe36699
cf10f44
 
20218cb
2270bc7
2a7ef32
 
 
 
 
fe36699
fd0dd62
 
 
2a7ef32
adec520
2a7ef32
 
 
 
 
9aee54a
2a7ef32
 
fe36699
d968fd4
9aee54a
49d0de6
d968fd4
 
 
 
 
 
 
5e8d963
d968fd4
 
 
61157d2
fe36699
5b4c39c
fe36699
cf10f44
 
 
 
 
61157d2
fe36699
 
cf10f44
7d978f8

import gradio as gr
import json
import os
import pdfplumber
import together
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import unicodedata
from dotenv import load_dotenv

load_dotenv()

# The Together.AI key is never passed explicitly in this file, so the
# `together` client is presumably reading TOGETHER_API_KEY from the environment
# (optionally populated from a .env file by load_dotenv above).
# Fail fast with a real exception: a bare `assert` is stripped under `python -O`.
if not os.getenv("TOGETHER_API_KEY"):
    raise RuntimeError("api key missing")

# Sentence-transformer model used to embed both stored documents and queries.
# ("BAAI/bge-base-en-v1.5" was the previously used alternative.)
embedding_model = SentenceTransformer(
    "togethercomputer/m2-bert-80M-8k-retrieval",
    trust_remote_code=True,  # Allow remote code execution
)

# Folder where document text files, the FAISS index and the metadata live
DATASET_DIR = "/home/user/.cache/huggingface/datasets/my_documents"
os.makedirs(DATASET_DIR, exist_ok=True)  # Ensure directory exists

# File paths inside the dataset folder
INDEX_FILE = os.path.join(DATASET_DIR, "faiss_index.bin")  # FAISS index file
METADATA_FILE = os.path.join(DATASET_DIR, "metadata.json")  # Metadata file

embedding_dim = 768  # Adjust according to model

# Debugging: Check working directory and available files
print("Current working directory:", os.getcwd())
print("Files in dataset directory:", os.listdir(DATASET_DIR))

# Load the persisted FAISS index if there is one; otherwise start empty.
# (The index is created exactly once here instead of being built and then
# immediately replaced.)
if os.path.exists(INDEX_FILE):
    print(" FAISS index file exists")
    index = faiss.read_index(INDEX_FILE)
else:
    print(" No FAISS index found. Creating a new one.")
    index = faiss.IndexFlatL2(embedding_dim)  # Empty FAISS index

# Load metadata: maps a vector's position in the index (as a string key)
# to the path of the text file holding that document.
if os.path.exists(METADATA_FILE):
    print(" Metadata file exists")
    with open(METADATA_FILE, "r", encoding="utf-8") as f:
        metadata = json.load(f)
else:
    metadata = {}

def store_document(text):
    """Persist a document and register its embedding in the FAISS index.

    Writes *text* to a ``doc_<n>.txt`` file under ``DATASET_DIR``, embeds it
    with ``embedding_model``, appends the vector to the global ``index``,
    records ``{vector position: file path}`` in the global ``metadata`` dict,
    and saves both the metadata and the index to disk.

    Args:
        text: Full document text to store.

    Returns:
        A status message containing the path of the saved file.
    """
    print(" Storing document...")

    # The vector added below will sit at position ``index.ntotal``. The file
    # number is derived from that same position (rather than from
    # ``len(metadata)``) so the file name and the metadata key can never
    # disagree: if the two stores ever drift apart, numbering by
    # ``len(metadata)`` makes successive uploads reuse one file name and
    # overwrite a document other metadata keys still point to.
    doc_index = index.ntotal
    doc_id = doc_index + 1
    filename = os.path.join(DATASET_DIR, f"doc_{doc_id}.txt")
    print(f"Saving document at: {filename}")

    # Save document to file
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)
    print(" Document saved")

    # Generate and store embedding (FAISS expects float32 vectors)
    embedding = embedding_model.encode([text]).astype(np.float32)
    index.add(embedding)  # Add to FAISS index
    print(" Embeddings generated")

    # Map the vector position to the document file and persist the mapping
    metadata[str(doc_index)] = filename
    with open(METADATA_FILE, "w", encoding="utf-8") as f:
        json.dump(metadata, f)
    print(" Saved Metadata")

    # Save FAISS index
    faiss.write_index(index, INDEX_FILE)
    print(" FAISS index saved")

    return f"Document stored at: {filename}"

def retrieve_document(query):
    """Return the text of the stored document nearest to *query*.

    Embeds the query with ``embedding_model``, asks the global FAISS ``index``
    for its single nearest neighbour and reads the file that ``metadata``
    associates with that neighbour. No distance threshold is applied: whenever
    the index holds at least one known document, the nearest one is returned.

    Args:
        query: The user's question.

    Returns:
        The document text, or ``None`` when the search yields no neighbour,
        the neighbour is not present in ``metadata``, or its file cannot be
        read.
    """
    print(f"Retrieving document based on:\n{query}")

    # Generate query embedding
    query_embedding = embedding_model.encode([query]).astype(np.float32)

    # Search for the closest document in FAISS index
    _, closest_idx = index.search(query_embedding, 1)
    best = int(closest_idx[0][0])

    # -1 is the "no neighbour" marker; an unknown position means the index
    # and the metadata are out of sync.
    filename = metadata.get(str(best)) if best != -1 else None
    if filename is None:
        print("No relevant document found")
        return None

    # Read and return the document content. A stale metadata entry (file
    # removed from disk) is treated like "nothing found" instead of raising
    # into the caller, which does not guard this call.
    try:
        with open(filename, "r", encoding="utf-8") as f:
            return f.read()
    except OSError as e:
        print(f"Could not read stored document {filename}: {e}")
        return None


# Patterns used by clean_text, compiled once because it runs once per PDF page.
_DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-Z0-9.,!?;:\'\"()\-]')
_PAGE_NUMBER_RE = re.compile(r'(?i)(page\s*\d+)')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Cleans extracted text for better processing by the model.

    Steps, in order:
      1. NFKC-normalize Unicode (e.g. ligatures become plain letters).
      2. Replace every character that is not an ASCII letter, digit or basic
         punctuation mark with a space.
      3. Remove "page <number>" markers (case-insensitive).
      4. Collapse whitespace runs to a single space and strip both ends.

    Whitespace is collapsed last because steps 2 and 3 introduce new spaces;
    collapsing first would leave runs of spaces and trailing spaces in the
    result.

    Args:
        text: Raw text, e.g. as extracted from one PDF page.

    Returns:
        The cleaned text; an empty string if nothing is left.
    """
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = _DISALLOWED_CHARS_RE.sub(' ', text)  # Keep basic punctuation
    text = _PAGE_NUMBER_RE.sub('', text)  # Remove page numbers
    return _WHITESPACE_RE.sub(' ', text).strip()  # Remove extra spaces and newlines

def extract_text_from_pdf(pdf_file):
    """Extract and clean text from the uploaded PDF, then store it.

    Each page that yields text is passed through ``clean_text`` and the pages
    are joined with single spaces. A non-empty result is handed to
    ``store_document`` so it becomes searchable.

    Args:
        pdf_file: The uploaded PDF, in whatever form ``pdfplumber.open``
            accepts.

    Returns:
        The cleaned text; an empty string when the PDF contains no
        extractable text (nothing is stored in that case); ``None`` if
        extraction or storage raised an exception.
    """
    print("extracting")
    try:
        with pdfplumber.open(pdf_file) as pdf:
            page_texts = [
                clean_text(raw) for page in pdf.pages if (raw := page.extract_text())
            ]
        text = " ".join(page_texts)

        # Do not index an empty document: it would add a meaningless vector
        # and an empty file that later queries could be matched against.
        if not text.strip():
            print("No extractable text found in PDF")
            return ""

        store_document(text)
        return text
    except Exception as e:
        # Boundary handler: any failure is reported to the caller as None.
        print(f"Error extracting text: {e}")
        return None

def split_text(text, chunk_size=500):
    """Cut *text* into consecutive pieces of at most *chunk_size* characters.

    The pieces are plain character slices taken left to right; the final
    piece may be shorter. An empty *text* gives an empty list.
    """
    print("splitting")
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def chatbot(pdf_file, user_question):
    """Processes the PDF and answers the user's question.

    If a PDF is supplied it is extracted (and stored) first. The stored
    document closest to the question is then retrieved and its first chunk is
    placed in the prompt; when no document is found the bare question is sent
    to the model instead.

    Args:
        pdf_file: Optional uploaded PDF; falsy when nothing was uploaded.
        user_question: The question typed by the user.

    Returns:
        The model's completion text, or a human-readable message when the PDF
        yields no text, the question is empty, or the model call fails.
    """
    print("chatbot start")

    if pdf_file:
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_file)
        if not text:
            return "Could not extract any text from the PDF."

    # An empty question would send an empty prompt to the model. Checked
    # after the upload step so a PDF submitted without a question is still
    # ingested.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    # Retrieve the document relevant to the query
    doc = retrieve_document(user_question)

    if doc:
        print(f"found doc{doc}")
        # Split into smaller chunks
        chunks = split_text(doc)

        # Use only the first chunk (to optimize token usage)
        prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
        print(f"prompt: \n{prompt}")
    else:
        prompt = user_question

    # Send to Together.AI (Mistral-7B)
    try:
        print("asking")
        response = together.Completion.create(
            model="mistralai/Mistral-7B-Instruct-v0.1",
            prompt=prompt,
            max_tokens=200,
            temperature=0.7,
        )

        # Return chatbot's response
        return response.choices[0].text
    except Exception as e:
        return f"Error generating response: {e}"



# Gradio Interface: a PDF upload and a question box in, one answer box out
pdf_input = gr.File(label="Upload PDF")
question_input = gr.Textbox(label="Ask a Question")
answer_output = gr.Textbox(label="Answer")

iface = gr.Interface(
    title="PDF Q&A Chatbot (Powered by Together.AI)",
    fn=chatbot,
    inputs=[pdf_input, question_input],
    outputs=answer_output,
)

# Start serving the app
iface.launch()