import gradio as gr
import os
import pdfplumber
from together import Together  # Together.AI SDK (client-based API, together >= 1.0)
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from huggingface_hub import login
import re
import unicodedata
from dotenv import load_dotenv

load_dotenv()
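
# The .env file is expected to provide these keys (names taken from the
# os.getenv calls below); the values here are placeholders:
#   TOGETHER_API_KEY=<your Together.AI key>
#   HUGGINGFACEHUB_API_TOKEN=<your Hugging Face token>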

# The Together.AI API key must be set in the environment (e.g. via the .env file);
# the Together client reads TOGETHER_API_KEY from the environment automatically.
assert os.getenv("TOGETHER_API_KEY"), "TOGETHER_API_KEY is missing"
client = Together()

# Retrieve the Hugging Face token (needed for gated models such as LLaMA-2)
api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

if api_token:
    login(api_token)  # Authenticate with Hugging Face


# Load LLaMA-2 model (note: loaded here but never used below; answers are
# generated via Together.AI, so this line can be removed to save memory)
llama_pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf")

# Load Sentence Transformer for embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize FAISS index
embedding_dim = 384  # For MiniLM model
index = faiss.IndexFlatL2(embedding_dim)
documents = []  # Store raw text for reference

def store_document(text):
    """Embeds the full document text and adds it to the FAISS index."""
    print("storing document")

    embedding = embedding_model.encode([text])
    index.add(np.array(embedding, dtype=np.float32))
    documents.append(text)

    print(f"Document stored ({len(documents)} stored in total)")

    return "Document stored!"

def retrieve_document(query):
    """Returns the stored document whose embedding is closest to the query."""
    print(f"retrieving doc based on {query}")

    if not documents:  # Nothing stored yet
        return None

    query_embedding = embedding_model.encode([query])
    _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), 1)

    print(f"retrieved document at index {closest_idx[0][0]}")

    return documents[closest_idx[0][0]]
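
# Illustrative usage (hypothetical strings, assuming a document was stored first):
#   store_document("The cat sat on the mat.")
#   retrieve_document("Where did the cat sit?")  # -> "The cat sat on the mat."
# FAISS returns the stored embedding nearest to the query by L2 distance.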


def clean_text(text):
    """Cleans extracted text for better processing by the model."""
    print("cleaning")
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\"()\-]', ' ', text)  # Keep letters, digits, and basic punctuation
    text = re.sub(r'(?i)page\s*\d+', '', text)  # Remove page numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace last, after the substitutions above
    return text
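
# Example of the cleaning above (illustrative input):
#   clean_text("Intro —\n  see Page 12")  ->  "Intro see"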

def extract_text_from_pdf(pdf_file):
    """Extract and clean text from the uploaded PDF."""
    print("extracting")
    try:
        with pdfplumber.open(pdf_file) as pdf:
            text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
        store_document(text)
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return None

def split_text(text, chunk_size=500):
    """Splits text into smaller chunks for better processing."""
    print("splitting")
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
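
# Example: split_text("abcdefg", chunk_size=3) -> ["abc", "def", "g"]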

def chatbot(pdf_file, user_question):
    """Processes the PDF and answers the user's question."""
    print("chatbot start")
    
    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_file)
    if not text:
        return "Could not extract any text from the PDF."

    # Retrieve the stored document most relevant to the query
    doc = retrieve_document(user_question)
    if not doc:
        return "No stored document matched the question."

    # Split into smaller chunks
    chunks = split_text(doc)

    # Use only the first chunk (to optimize token usage)
    prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"

    # Send the prompt to Together.AI (Mistral-7B)
    try:
        response = client.completions.create(
            model="mistralai/Mistral-7B-Instruct-v0.1",
            prompt=prompt,
            max_tokens=200,
            temperature=0.7,
        )

        # Return the chatbot's response
        return response.choices[0].text
    except Exception as e:
        return f"Error generating response: {e}"


# Gradio Interface
iface = gr.Interface(
    fn=chatbot,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Ask a Question")],
    outputs=gr.Textbox(label="Answer"),
    title="PDF Q&A Chatbot (Powered by Together.AI)"
)

# Launch Gradio app
iface.launch()
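
# To try it locally (assuming this script is saved as app.py): run `python app.py`,
# then open the local URL that Gradio prints to the console.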