Spaces:
Running
Running
Update app.py
Browse files — updated the code to include FAISS and a transformer
app.py
CHANGED
@@ -2,6 +2,10 @@ import gradio as gr
|
|
2 |
import os
|
3 |
import pdfplumber
|
4 |
import together
|
|
|
|
|
|
|
|
|
5 |
import re
|
6 |
import unicodedata
|
7 |
from dotenv import load_dotenv
|
@@ -11,6 +15,40 @@ load_dotenv()
|
|
11 |
# Set up Together.AI API Key (Replace with your actual key)
|
12 |
assert os.getenv("TOGETHER_API_KEY"), "api key missing"
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
def clean_text(text):
|
15 |
"""Cleans extracted text for better processing by the model."""
|
16 |
print("cleaning")
|
@@ -26,6 +64,7 @@ def extract_text_from_pdf(pdf_file):
|
|
26 |
try:
|
27 |
with pdfplumber.open(pdf_file) as pdf:
|
28 |
text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
|
|
|
29 |
return text
|
30 |
except Exception as e:
|
31 |
print(f"Error extracting text: {e}")
|
@@ -44,9 +83,12 @@ def chatbot(pdf_file, user_question):
|
|
44 |
text = extract_text_from_pdf(pdf_file)
|
45 |
if not text:
|
46 |
return "Could not extract any text from the PDF."
|
47 |
-
|
|
|
|
|
|
|
48 |
# Split into smaller chunks
|
49 |
-
chunks = split_text(
|
50 |
|
51 |
# Use only the first chunk (to optimize token usage)
|
52 |
prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
|
|
|
2 |
import os
|
3 |
import pdfplumber
|
4 |
import together
|
5 |
+
from transformers import pipeline
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
import faiss
|
8 |
+
import numpy as np
|
9 |
import re
|
10 |
import unicodedata
|
11 |
from dotenv import load_dotenv
|
|
|
15 |
# Set up Together.AI API Key (Replace with your actual key)
|
16 |
assert os.getenv("TOGETHER_API_KEY"), "api key missing"
|
17 |
|
18 |
+
|
19 |
+
# Load LLaMA-2 Model
|
20 |
+
llama_pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf")
|
21 |
+
|
22 |
+
# Load Sentence Transformer for embeddings
|
23 |
+
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
24 |
+
|
25 |
+
# Initialize FAISS index
|
26 |
+
embedding_dim = 384 # For MiniLM model
|
27 |
+
index = faiss.IndexFlatL2(embedding_dim)
|
28 |
+
documents = [] # Store raw text for reference
|
29 |
+
|
30 |
+
def store_document(text):
    """Embed *text*, append its vector to the module-level FAISS index,
    and keep the raw text in ``documents`` so it can be looked up later
    by ``retrieve_document``.

    Returns a short confirmation string for the caller/UI.
    """
    print("storing document")

    # Encode as a single-row batch, then hand FAISS the float32 matrix it expects.
    vectors = np.asarray(embedding_model.encode([text]), dtype=np.float32)
    index.add(vectors)
    documents.append(text)

    print(f"your document has been stored: \n{documents}")

    return "Document stored!"
|
40 |
+
|
41 |
+
def retrieve_document(query):
    """Return the stored document whose embedding is closest to *query*.

    Searches the module-level FAISS ``index`` (populated by
    ``store_document``) for the single nearest neighbour and returns the
    matching raw text from ``documents``.

    NOTE(review): if nothing has been stored yet, ``index.search`` returns
    index -1 and this would silently return ``documents[-1]`` or raise —
    callers should store a document first.
    """
    print(f"retrieving doc based on {query}")

    query_embedding = embedding_model.encode([query])
    # k=1: we only want the single best match; search returns (distances, indices).
    _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), 1)

    # Bug fix: the original f-string read `{documents[closest_idx[0][0]}` —
    # missing the closing `]`, a SyntaxError that broke the whole module.
    # Also look the document up once instead of twice.
    best_match = documents[closest_idx[0][0]]
    print(f"retrieved: {best_match}")

    return best_match
|
50 |
+
|
51 |
+
|
52 |
def clean_text(text):
|
53 |
"""Cleans extracted text for better processing by the model."""
|
54 |
print("cleaning")
|
|
|
64 |
try:
|
65 |
with pdfplumber.open(pdf_file) as pdf:
|
66 |
text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
|
67 |
+
store_document(text)
|
68 |
return text
|
69 |
except Exception as e:
|
70 |
print(f"Error extracting text: {e}")
|
|
|
83 |
text = extract_text_from_pdf(pdf_file)
|
84 |
if not text:
|
85 |
return "Could not extract any text from the PDF."
|
86 |
+
|
87 |
+
# retrieve the document relevant to the query
|
88 |
+
doc = retrieve_document(user_question)
|
89 |
+
|
90 |
# Split into smaller chunks
|
91 |
+
chunks = split_text(doc)
|
92 |
|
93 |
# Use only the first chunk (to optimize token usage)
|
94 |
prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
|