Spaces:
Running
Running
Update app.py
Browse files — updated the code to include FAISS and a transformer
app.py
CHANGED
@@ -2,6 +2,10 @@ import gradio as gr
|
|
2 |
import os
|
3 |
import pdfplumber
|
4 |
import together
|
|
|
|
|
|
|
|
|
5 |
import re
|
6 |
import unicodedata
|
7 |
from dotenv import load_dotenv
|
@@ -11,6 +15,40 @@ load_dotenv()
|
|
11 |
# Set up Together.AI API Key (Replace with your actual key)
|
12 |
assert os.getenv("TOGETHER_API_KEY"), "api key missing"
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
def clean_text(text):
|
15 |
"""Cleans extracted text for better processing by the model."""
|
16 |
print("cleaning")
|
@@ -26,6 +64,7 @@ def extract_text_from_pdf(pdf_file):
|
|
26 |
try:
|
27 |
with pdfplumber.open(pdf_file) as pdf:
|
28 |
text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
|
|
|
29 |
return text
|
30 |
except Exception as e:
|
31 |
print(f"Error extracting text: {e}")
|
@@ -44,9 +83,12 @@ def chatbot(pdf_file, user_question):
|
|
44 |
text = extract_text_from_pdf(pdf_file)
|
45 |
if not text:
|
46 |
return "Could not extract any text from the PDF."
|
47 |
-
|
|
|
|
|
|
|
48 |
# Split into smaller chunks
|
49 |
-
chunks = split_text(
|
50 |
|
51 |
# Use only the first chunk (to optimize token usage)
|
52 |
prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
|
|
|
2 |
import os
|
3 |
import pdfplumber
|
4 |
import together
|
5 |
+
from transformers import pipeline
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
import faiss
|
8 |
+
import numpy as np
|
9 |
import re
|
10 |
import unicodedata
|
11 |
from dotenv import load_dotenv
|
|
|
15 |
# Set up Together.AI API Key (Replace with your actual key)
|
16 |
assert os.getenv("TOGETHER_API_KEY"), "api key missing"
|
17 |
|
18 |
+
|
19 |
+
# Load LLaMA-2 Model
|
20 |
+
llama_pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf")
|
21 |
+
|
22 |
+
# Load Sentence Transformer for embeddings
|
23 |
+
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
24 |
+
|
25 |
+
# Initialize FAISS index
|
26 |
+
embedding_dim = 384 # For MiniLM model
|
27 |
+
index = faiss.IndexFlatL2(embedding_dim)
|
28 |
+
documents = [] # Store raw text for reference
|
29 |
+
|
30 |
+
def store_document(text):
    """Embed *text*, append its vector to the module-level FAISS index,
    and keep the raw text in ``documents`` so it can be looked up later
    by ``retrieve_document``.

    Returns a short confirmation string for the caller/UI.
    """
    print("storing document")

    # Encode as a single-row batch, then hand FAISS the float32 matrix it expects.
    vectors = np.asarray(embedding_model.encode([text]), dtype=np.float32)
    index.add(vectors)
    documents.append(text)

    print(f"your document has been stored: \n{documents}")

    return "Document stored!"
|
40 |
+
|
41 |
+
def retrieve_document(query):
    """Return the stored document whose embedding is closest to *query*.

    Searches the module-level FAISS ``index`` (populated by
    ``store_document``) for the single nearest neighbour and returns the
    matching raw text from ``documents``.

    NOTE(review): if nothing has been stored yet, ``index.search`` returns
    index -1 and this would silently return ``documents[-1]`` or raise —
    callers should store a document first.
    """
    print(f"retrieving doc based on {query}")

    query_embedding = embedding_model.encode([query])
    # k=1: we only want the single best match; search returns (distances, indices).
    _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), 1)

    # Bug fix: the original f-string read `{documents[closest_idx[0][0]}` —
    # missing the closing `]`, a SyntaxError that broke the whole module.
    # Also look the document up once instead of twice.
    best_match = documents[closest_idx[0][0]]
    print(f"retrieved: {best_match}")

    return best_match
|
50 |
+
|
51 |
+
|
52 |
def clean_text(text):
|
53 |
"""Cleans extracted text for better processing by the model."""
|
54 |
print("cleaning")
|
|
|
64 |
try:
|
65 |
with pdfplumber.open(pdf_file) as pdf:
|
66 |
text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
|
67 |
+
store_document(text)
|
68 |
return text
|
69 |
except Exception as e:
|
70 |
print(f"Error extracting text: {e}")
|
|
|
83 |
text = extract_text_from_pdf(pdf_file)
|
84 |
if not text:
|
85 |
return "Could not extract any text from the PDF."
|
86 |
+
|
87 |
+
# retrieve the document relevant to the query
|
88 |
+
doc = retrieve_document(user_question)
|
89 |
+
|
90 |
# Split into smaller chunks
|
91 |
+
chunks = split_text(doc)
|
92 |
|
93 |
# Use only the first chunk (to optimize token usage)
|
94 |
prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
|