File size: 4,054 Bytes
fe36699
924bf1b
cf10f44
61157d2
2270bc7
 
 
92757b3
 
9212ca3
 
 
fe36699
61157d2
a5bd9c0
fe36699
84b4386
 
 
2270bc7
 
 
 
 
 
 
 
 
9aee54a
2270bc7
 
 
9aee54a
2270bc7
 
 
 
9aee54a
2270bc7
 
 
 
9aee54a
2270bc7
 
 
 
cf10f44
 
20218cb
cf10f44
 
 
 
 
fe36699
cf10f44
 
20218cb
cf10f44
 
 
2270bc7
cf10f44
 
 
 
fe36699
cf10f44
61157d2
20218cb
61157d2
fe36699
cf10f44
 
20218cb
2270bc7
2a7ef32
 
 
 
 
fe36699
2a7ef32
 
 
 
 
 
 
9aee54a
2a7ef32
 
 
 
 
9aee54a
2a7ef32
 
fe36699
d968fd4
9aee54a
49d0de6
d968fd4
 
 
 
 
 
 
5e8d963
d968fd4
 
 
61157d2
fe36699
5b4c39c
fe36699
cf10f44
 
 
 
 
61157d2
fe36699
 
cf10f44
5f6bee3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import gradio as gr
import os
import pdfplumber
import together
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import unicodedata
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Fail fast at startup when the Together.AI API key is not in the environment.
# NOTE(review): the key is only checked here and never handed to the client
# explicitly; presumably the `together` package reads TOGETHER_API_KEY from the
# environment on its own -- confirm. Also note that `assert` statements are
# skipped when Python runs with -O.
assert os.getenv("TOGETHER_API_KEY"), "api key missing"

# Sentence-transformer model used to embed both stored documents and queries.
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")  # Alternative: 'togethercomputer/m2-bert-80M-8k-retrieval'
# Size of the vectors the index is created for; it must equal the size of the
# vectors produced by embedding_model, so adjust it when the model changes.
embedding_dim = 768  # Adjust according to model

# Flat L2-distance FAISS index holding one vector per stored document.
index = faiss.IndexFlatL2(embedding_dim)
# Raw document texts, appended in the same order as vectors are added to
# `index`, so a position returned by index.search() indexes into this list.
documents = []  # Store raw text for reference

def store_document(text):
    """Embed ``text`` and register it for later retrieval.

    The embedding is added to the module-level FAISS ``index`` and the raw
    text is appended to ``documents`` so that both stay aligned by position.

    Args:
        text: The document text to embed and store.

    Returns:
        The fixed confirmation string ``"Document stored!"``.
    """
    print("storing document")

    vectors = embedding_model.encode([text])
    print(f"embedding: \n{vectors}")

    # Cast to float32 before handing the batch to the index.
    as_float32 = np.array(vectors, dtype=np.float32)
    index.add(as_float32)
    documents.append(text)

    print("your document has been stored")

    return "Document stored!"

def retrieve_document(query):
    """Return the stored document whose embedding is closest to ``query``.

    Args:
        query: The text to embed and look up in the FAISS index.

    Returns:
        The raw text of the nearest stored document, or ``None`` when no
        document has been stored yet (or the index reports no valid match).
    """
    print(f"retrieving doc based on: \n{query}")

    # Nothing indexed yet: report "no document" instead of letting the lookup
    # below fail with an IndexError on the empty list.
    if not documents:
        print("no documents stored yet")
        return None

    query_embedding = embedding_model.encode([query])
    _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), 1)

    # FAISS fills missing neighbours with -1; treat any out-of-range label as
    # "no match" rather than silently indexing from the end of the list.
    best = int(closest_idx[0][0])
    if best < 0 or best >= len(documents):
        return None

    match = documents[best]
    print(f"retrieved: \n{match}")

    return match


def clean_text(text):
    """Clean extracted text for better processing by the model.

    Steps, in order:
      1. Normalize Unicode to NFKC (e.g. ligatures become plain letters).
      2. Replace every character that is not an ASCII letter, digit or basic
         punctuation mark with a space.
      3. Remove page markers such as ``Page 12`` (case-insensitive, only when
         ``page`` starts a word, so "homepage 2" is left alone).
      4. Collapse all whitespace runs into single spaces and trim the ends.

    Whitespace is collapsed last because steps 2 and 3 themselves introduce
    new spaces.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The cleaned, single-spaced text.
    """
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\"()\-]', ' ', text)  # Keep basic punctuation
    text = re.sub(r'\bpage\s*\d+', '', text, flags=re.IGNORECASE)  # Remove page numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and newlines
    return text

def extract_text_from_pdf(pdf_file):
    """Extract and clean text from the uploaded PDF, then index it.

    Each page's text is extracted with pdfplumber and passed through
    ``clean_text``; pages that yield no text are skipped. The joined text is
    stored via ``store_document`` only when it is non-empty and has not been
    stored before, so asking several questions about the same PDF does not
    keep adding duplicate entries to the index.

    Args:
        pdf_file: The PDF to read, passed straight to ``pdfplumber.open``.

    Returns:
        The cleaned text of the whole PDF (an empty string when no page
        contains extractable text), or ``None`` if extraction or storing
        raised an exception.
    """
    print("extracting")
    try:
        cleaned_pages = []
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                raw = page.extract_text()
                if not raw:
                    continue
                cleaned = clean_text(raw).strip()
                if cleaned:
                    cleaned_pages.append(cleaned)

        text = " ".join(cleaned_pages)

        # Do not index empty text, and do not index the same document twice.
        if text and text not in documents:
            store_document(text)
        return text
    except Exception as e:
        # Best-effort: report the problem and let the caller handle None.
        print(f"Error extracting text: {e}")
        return None

def split_text(text, chunk_size=500):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece (default 500).

    Returns:
        A list of substrings in original order; the last one may be shorter.
        An empty ``text`` yields an empty list.
    """
    print("splitting")
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def chatbot(pdf_file, user_question):
    """Process the PDF (if given) and answer the user's question.

    Flow:
      1. If a PDF was uploaded, extract its text (which also indexes it).
      2. Reject an empty question before doing any retrieval or API call.
      3. Retrieve the stored document closest to the question.
      4. Build a prompt from the first chunk of that document (or from the
         bare question when no document is available) and send it to
         Together.AI.

    Args:
        pdf_file: The uploaded PDF, or a falsy value when none was uploaded.
        user_question: The question typed by the user.

    Returns:
        The model's answer text, or a human-readable message describing why
        no answer could be produced. Errors are returned as strings rather
        than raised so they show up in the UI's answer box.
    """
    print("chatbot start")

    if pdf_file:
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_file)
        if not text:
            return "Could not extract any text from the PDF."

    # An empty question gives nothing to retrieve or answer; skip the
    # embedding and the remote API call entirely.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    try:
        # retrieve the document relevant to the query
        doc = retrieve_document(user_question)
    except Exception as e:
        return f"Error retrieving document relevant to the query: {user_question} \n{e}"

    if doc:
        print("found doc")
        # Split into smaller chunks
        chunks = split_text(doc)

        # Use only the first chunk (to optimize token usage)
        prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
        print(f"prompt: \n{prompt}")
    else:
        # No document available: fall back to asking the question directly.
        prompt = user_question

    # Send to Together.AI (Mistral-7B)
    try:
        print("asking")
        response = together.Completion.create(
            model="mistralai/Mistral-7B-Instruct-v0.1",
            prompt=prompt,
            max_tokens=200,
            temperature=0.7,
        )

        # Return chatbot's response
        return response.choices[0].text
    except Exception as e:
        return f"Error generating response: {e}"



# Gradio Interface
# The two inputs are passed positionally to chatbot(pdf_file, user_question);
# the string it returns is shown in the "Answer" textbox.
iface = gr.Interface(
    fn=chatbot,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Ask a Question")],
    outputs=gr.Textbox(label="Answer"),
    title="PDF Q&A Chatbot (Powered by Together.AI)"
)

# Launch Gradio app
# NOTE(review): this runs whenever the module is executed or imported, since
# there is no `if __name__ == "__main__":` guard.
iface.launch()