File size: 7,097 Bytes
fe36699
6dee028
924bf1b
7fa1089
cf10f44
85f7f2f
61157d2
c6d3428
2270bc7
 
92757b3
 
9212ca3
fecb931
9212ca3
 
bdab5b0
 
 
 
85f7f2f
24060f0
 
 
61157d2
a5bd9c0
fe36699
84b4386
4468b37
 
 
 
 
 
 
 
1ed3cce
84b4386
2270bc7
fecb931
7c0523d
c8d88b2
7d978f8
5c2668d
fecb931
 
 
 
be1a3b5
 
 
85f7f2f
be1a3b5
2319f8b
3a5682e
be1a3b5
85f7f2f
24060f0
cccc448
8504e03
2d5af5b
5848dd4
81c492e
e01694c
4784d20
e01694c
cbb907e
85f7f2f
 
7bccdcb
7f8ac14
fecb931
be1a3b5
2270bc7
a6dfbcd
 
 
e474e6a
 
2646d8d
053606e
 
 
 
 
 
 
 
 
b1fedda
 
6e7cfd2
b1fedda
 
 
 
dd8ef37
9961d18
2af8ac7
 
 
 
 
 
a5f4249
 
81646ae
 
 
a5f4249
 
053606e
 
a5f4249
053606e
a6dfbcd
 
 
 
 
 
bdab5b0
 
 
a6dfbcd
 
 
 
 
 
 
 
bdab5b0
 
a6dfbcd
 
 
 
cf10f44
 
20218cb
cf10f44
 
2646d8d
cf10f44
 
fe36699
cf10f44
 
b1af2d3
cf10f44
 
 
 
 
01ba052
cf10f44
fe36699
cf10f44
61157d2
20218cb
61157d2
fe36699
12af8e8
cf10f44
20218cb
fecb931
fd0dd62
 
 
2a7ef32
b5dee8e
2a7ef32
 
 
 
 
b5dee8e
2a7ef32
 
fe36699
d968fd4
9aee54a
49d0de6
d968fd4
 
 
 
 
 
 
5e8d963
d968fd4
 
 
61157d2
fe36699
5ae6760
 
fe36699
cf10f44
b26c7d3
 
 
 
3837c7b
b26c7d3
 
 
 
 
 
 
fecb931
 
 
592dcd0
fecb931
 
 
b26c7d3
5f799ae
b26c7d3
cf10f44
01ba052
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import gradio as gr
import json
import os
import io
import pdfplumber
import requests
import together
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import unicodedata
from dotenv import load_dotenv
from flask import jsonify

# Populate the process environment (python-dotenv) before anything below
# reads it; the TOGETHER_API_KEY check further down depends on this order.
load_dotenv()
# Base URL of the remote storage service; the three endpoint URLs below are
# derived from it.
# NOTE(review): this is a hard-coded ngrok tunnel address, presumably
# short-lived — confirm whether it should come from the environment instead.
API_URL = "https://e4e5-196-96-202-255.ngrok-free.app"
API_URL_FILES = f"{API_URL}/file"
API_URL_EMBEDDINGS = f"{API_URL}/embeddings"
API_URL_METADATA = f"{API_URL}/metadata"

# FAISS index setup
# NOTE(review): DIM is not referenced by any function visible in this file;
# store_document_data sizes its index from the embedding's own shape.
DIM = 768  # Adjust based on the embedding model

# Fail at import time when TOGETHER_API_KEY is unset or empty. The key is only
# checked here, never assigned; presumably the `together` client reads the
# same environment variable itself — TODO confirm.
# NOTE(review): `assert` statements are skipped when Python runs with -O.
assert os.getenv("TOGETHER_API_KEY"), "api key missing"

# Use a sentence transformer for embeddings
# Alternative model kept for reference: 'BAAI/bge-base-en-v1.5'
# embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Active model: 'togethercomputer/m2-bert-80M-8k-retrieval'
# Constructed once at import time and shared by store_document_data and
# retrieve_document.
embedding_model = SentenceTransformer(
    "togethercomputer/m2-bert-80M-8k-retrieval", 
    trust_remote_code=True  # Allow remote code execution
)

# NOTE(review): same value as DIM above and likewise unused in the visible code.
embedding_dim = 768  # Adjust according to model


def store_document_data(PDF_FILE):
    """Embed an uploaded PDF and upload the resulting FAISS index.

    The PDF text is extracted with ``extract_text_from_pdf``, encoded as a
    single vector with the module-level ``embedding_model``, stored in a
    fresh ``faiss.IndexFlatL2`` written to ``index.bin`` in the working
    directory, and that file is POSTed to ``API_URL_EMBEDDINGS``.

    Args:
        PDF_FILE: The uploaded file as handed over by the Gradio ``File``
            input; anything falsy means "nothing uploaded".

    Returns:
        * the position of the document inside the newly built index (an
          ``int``) on success,
        * a human-readable message (``str``) when no file was given or no
          text could be extracted,
        * ``{"error": <message>}`` when the upload fails, including when the
          server answers with an HTTP error status.
    """
    print(" Storing document...")

    if not PDF_FILE:
        return "No PDF file provided."

    # Extract text from the PDF
    text = extract_text_from_pdf(PDF_FILE)
    if not text:
        return "Could not extract any text from the PDF."

    # The whole document becomes one vector, so the index holds one entry.
    embedding = embedding_model.encode([text]).astype(np.float32)
    print("Embeddings generated")
    print("Embedding shape:", embedding.shape)

    # Size the index from the embedding itself rather than a constant so a
    # model change cannot produce a dimension mismatch.
    index = faiss.IndexFlatL2(embedding.shape[1])
    index.add(embedding)
    print(index, index.ntotal)
    index_file = "index.bin"
    faiss.write_index(index, index_file)
    doc_index = index.ntotal - 1

    print(f"sending to {API_URL_EMBEDDINGS}")
    # Only the network call sits inside the try: the handler below cannot
    # deal with anything but request failures.
    try:
        with open(index_file, "rb") as f:
            response = requests.post(
                API_URL_EMBEDDINGS,
                files={"file": f},
                timeout=60,  # never hang the UI on an unreachable server
            )
        # Without this a 4xx/5xx answer would be reported as a success.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}

    print("sent")
    return doc_index

def retrieve_document(query):
    """Return the text of the stored document closest to ``query``.

    Steps:
      1. download the serialized FAISS index and the metadata JSON,
      2. embed ``query`` with the module-level ``embedding_model``,
      3. look up the nearest vector and map its position to a file name
         through the metadata,
      4. download that file and return its text.

    Args:
        query: Free-text question used to search the index.

    Returns:
        The document text, or ``None`` when a request fails, a payload
        cannot be decoded, or the index has no matching entry.
    """
    print(f"Retrieving document based on:\n{query}")

    try:
        embeddings_ = requests.get(API_URL_EMBEDDINGS, timeout=60)
        metadata_ = requests.get(API_URL_METADATA, timeout=60)
    except requests.exceptions.RequestException as e:
        # Connection problems are reported like every other failure here.
        print(f"Error contacting storage API: {e}")
        return None

    # Check for errors before parsing the payloads
    if embeddings_.status_code != 200:
        print(f"Error fetching embeddings: {embeddings_.status_code} - {embeddings_.text}")
        return None

    if metadata_.status_code != 200:
        print(f"Error fetching metadata: {metadata_.status_code} - {metadata_.text}")
        return None

    try:
        # The response body IS the metadata; it must not be re-opened as if
        # it were a path on the local disk.
        metadata = metadata_.json()
        print(metadata)
    except ValueError as e:  # requests' JSONDecodeError derives from ValueError
        print(f"Error decoding metadata JSON: {e}")
        return None

    # NOTE(review): the lookup below uses str(index position) as key, so the
    # metadata is assumed to be a JSON object mapping "0", "1", ... to file
    # names — confirm against the server implementation.
    if not isinstance(metadata, dict):
        print(f"Unexpected metadata format: {type(metadata).__name__}")
        return None

    try:
        print(f"Received {len(embeddings_.content)} bytes of index data")
        # faiss.deserialize_index expects a uint8 numpy array, not bytes.
        # NOTE(review): assumes the endpoint returns the raw index file that
        # store_document_data uploaded — confirm.
        index = faiss.deserialize_index(
            np.frombuffer(embeddings_.content, dtype=np.uint8)
        )
        print(f"Successfully loaded FAISS index with {index.ntotal} vectors.")
    except Exception as e:
        print(f"Error loading FAISS index: {e}")
        return None

    print(index, metadata)

    # Generate query embedding
    query_embedding = embedding_model.encode([query]).astype(np.float32)

    # Search for the closest document in FAISS index
    _, closest_idx = index.search(query_embedding, 1)
    best = closest_idx[0][0]
    doc_key = str(best)

    # FAISS reports -1 when it has no neighbour to return.
    if best == -1 or doc_key not in metadata:
        print("No relevant document found")
        return None

    filename = metadata[doc_key]

    try:
        # Same request the code always sent (the second positional argument
        # of requests.get is `params`), now spelled out.
        # NOTE(review): confirm how the /file endpoint expects the name.
        file_response = requests.get(API_URL_FILES, params=filename, timeout=60)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching file {filename}: {e}")
        return None

    if file_response.status_code != 200:
        print(f"Error fetching file: {file_response.status_code} - {file_response.text}")
        return None

    # Use the downloaded bytes instead of opening a local path that this
    # process never wrote. PDFs go through the same extraction as uploads;
    # anything else is treated as plain text.
    if file_response.content.startswith(b"%PDF"):
        return extract_text_from_pdf(io.BytesIO(file_response.content))
    return file_response.text

def clean_text(text):
    """Normalise extracted PDF text before it is embedded or prompted.

    Processing order:
      1. Unicode NFKC normalisation (e.g. ligatures become plain letters).
      2. Every character that is not an ASCII letter, a digit or one of
         ``. , ! ? ; : \\ " ( ) -`` is replaced by a space. Apostrophes and
         non-ASCII letters are therefore replaced as well.
      3. ``page <number>`` markers are removed (case-insensitive).
      4. Runs of whitespace are collapsed to one space and the ends trimmed.

    Whitespace is collapsed last because steps 2 and 3 introduce new blanks;
    collapsing first would leave double spaces and leading/trailing spaces
    in the result.

    Args:
        text: Raw text of one page or document.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\\"()\-]', ' ', text)  # Keep basic punctuation
    text = re.sub(r'(?i)(page\s*\d+)', '', text)  # Remove page numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and newlines
    return text

def extract_text_from_pdf(pdf_file):
    """Return the cleaned text of every page of ``pdf_file`` joined by spaces.

    Pages for which pdfplumber yields no text are skipped. Any failure while
    opening or reading the PDF is printed and reported as ``None``.
    """
    print("extracting")
    try:
        with pdfplumber.open(pdf_file) as document:
            cleaned_pages = []
            for page in document.pages:
                raw = page.extract_text()
                if raw:
                    cleaned_pages.append(clean_text(raw))
            combined = " ".join(cleaned_pages)
        return combined
    except Exception as err:
        print(f"Error extracting text: {err}{pdf_file}")
        return None

def split_text(text, chunk_size=500):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The last piece may be shorter; an empty ``text`` gives an empty list.
    """
    print("splitting")
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def chatbot(user_question):
    """Answer ``user_question``, grounding it on a retrieved document if any.

    When ``retrieve_document`` returns text, only its first chunk is put into
    the prompt (to limit token usage); otherwise the bare question is sent.
    Returns the completion text, or an error message string if the
    completion call raises.
    """
    print("chatbot start")

    # retrieve the document relevant to the query
    doc = retrieve_document(user_question)

    if not doc:
        prompt = user_question
    else:
        print(f"found doc:\n{doc}\n")
        # Split into smaller chunks and keep only the first one
        first_chunk = split_text(doc)[0]
        prompt = f"Based on this document, answer the question:\n\nDocument:\n{first_chunk}\n\nQuestion: {user_question}"
        print(f"prompt:\n{prompt}")

    # Send to Together.AI (Mistral-7B)
    request_args = {
        "model": "mistralai/Mistral-7B-Instruct-v0.1",
        "prompt": prompt,
        "max_tokens": 200,
        "temperature": 0.7,
    }
    try:
        print("asking")
        completion = together.Completion.create(**request_args)
        # Return chatbot's response
        return completion.choices[0].text
    except Exception as err:
        return f"Error generating response: {err}"

def helloWorld(text):
    """Return ``text`` followed by the fixed suffix `` : hello world``."""
    greeting = "{} : hello world".format(text)
    return greeting

# Gradio Interface
# One tab per function: question answering, a smoke-test echo, and PDF upload.
qa_tab = gr.Interface(
    fn=chatbot,
    inputs=gr.Textbox(label="Ask a Question"),
    outputs=gr.Textbox(label="Answer"),
    title="PDF Q&A Chatbot (Powered by Together.AI)",
)

hello_tab = gr.Interface(
    fn=helloWorld,
    inputs="text",
    outputs="text",
)

upload_tab = gr.Interface(
    fn=store_document_data,
    inputs=[gr.File(label="PDF_FILE")],
    outputs=gr.Textbox(label="Answer"),
    title="pdf file, metadata, index parsing and storing",
)

iface = gr.TabbedInterface([qa_tab, hello_tab, upload_tab])

# Launch Gradio app
iface.launch(show_error=True)