Penality committed on
Commit
2270bc7
·
verified ·
1 Parent(s): f246008

Update app.py

Browse files

updated the code to include FAISS and a transformer

Files changed (1) hide show
  1. app.py +44 -2
app.py CHANGED
@@ -2,6 +2,10 @@ import gradio as gr
2
  import os
3
  import pdfplumber
4
  import together
 
 
 
 
5
  import re
6
  import unicodedata
7
  from dotenv import load_dotenv
@@ -11,6 +15,40 @@ load_dotenv()
11
  # Set up Together.AI API Key (Replace with your actual key)
12
  assert os.getenv("TOGETHER_API_KEY"), "api key missing"
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def clean_text(text):
15
  """Cleans extracted text for better processing by the model."""
16
  print("cleaning")
@@ -26,6 +64,7 @@ def extract_text_from_pdf(pdf_file):
26
  try:
27
  with pdfplumber.open(pdf_file) as pdf:
28
  text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
 
29
  return text
30
  except Exception as e:
31
  print(f"Error extracting text: {e}")
@@ -44,9 +83,12 @@ def chatbot(pdf_file, user_question):
44
  text = extract_text_from_pdf(pdf_file)
45
  if not text:
46
  return "Could not extract any text from the PDF."
47
-
 
 
 
48
  # Split into smaller chunks
49
- chunks = split_text(text)
50
 
51
  # Use only the first chunk (to optimize token usage)
52
  prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
 
2
  import os
3
  import pdfplumber
4
  import together
5
+ from transformers import pipeline
6
+ from sentence_transformers import SentenceTransformer
7
+ import faiss
8
+ import numpy as np
9
  import re
10
  import unicodedata
11
  from dotenv import load_dotenv
 
15
  # Set up Together.AI API Key (Replace with your actual key)
16
  assert os.getenv("TOGETHER_API_KEY"), "api key missing"
17
 
18
+
19
# LLaMA-2 chat model used for answer generation.
llama_pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf")

# Sentence-level embedding model; produces 384-dimensional vectors.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Flat (exact, L2-distance) FAISS index sized for the MiniLM embeddings.
embedding_dim = 384  # output dimension of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(embedding_dim)
documents = []  # raw texts, row-aligned with the vectors added to the index
29
+
30
def store_document(text):
    """Embed *text*, add the vector to the FAISS index, and keep the raw text.

    The raw string is appended to the module-level ``documents`` list so the
    index row and the list position stay aligned.
    """
    print("storing document")

    vec = embedding_model.encode([text])
    # FAISS requires float32 input.
    index.add(np.array(vec, dtype=np.float32))
    documents.append(text)

    print(f"your document has been stored: \n{documents}")

    return "Document stored!"
40
+
41
def retrieve_document(query):
    """Return the stored document whose embedding is nearest to *query*.

    Embeds the query with the module-level SentenceTransformer, runs a top-1
    L2 search against the FAISS index, and maps the hit back to the raw text
    kept in ``documents``. Returns an empty string when nothing is stored.
    """
    print(f"retrieving doc based on {query}")

    query_embedding = embedding_model.encode([query])
    # search() returns (distances, indices); FAISS requires float32 input.
    _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), 1)

    best = int(closest_idx[0][0])
    # FAISS yields -1 when the index is empty; without this guard that would
    # silently resolve to documents[-1] (or raise IndexError on an empty list).
    if best < 0 or best >= len(documents):
        return ""

    # Bug fix: original f-string was missing the closing ']' — a SyntaxError.
    print(f"retrieved: {documents[best]}")

    return documents[best]
50
+
51
+
52
  def clean_text(text):
53
  """Cleans extracted text for better processing by the model."""
54
  print("cleaning")
 
64
  try:
65
  with pdfplumber.open(pdf_file) as pdf:
66
  text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
67
+ store_document(text)
68
  return text
69
  except Exception as e:
70
  print(f"Error extracting text: {e}")
 
83
  text = extract_text_from_pdf(pdf_file)
84
  if not text:
85
  return "Could not extract any text from the PDF."
86
+
87
+ # retrieve the document relevant to the query
88
+ doc = retrieve_document(user_question)
89
+
90
  # Split into smaller chunks
91
+ chunks = split_text(doc)
92
 
93
  # Use only the first chunk (to optimize token usage)
94
  prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"