import gradio as gr
import json
import os
import pdfplumber
import together
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import unicodedata
from dotenv import load_dotenv
load_dotenv()
# Together.AI API key is read from the TOGETHER_API_KEY environment variable (e.g. set in a .env file)
assert os.getenv("TOGETHER_API_KEY"), "TOGETHER_API_KEY environment variable is missing"
# Sentence-transformer model used for embeddings
# (alternative: embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5"))
embedding_model = SentenceTransformer(
    "togethercomputer/m2-bert-80M-8k-retrieval",
    trust_remote_code=True  # Required to load this model's custom modeling code from the Hub
)
# Define dataset storage folder
DATASET_DIR = "/home/user/.cache/huggingface/datasets/my_documents"
os.makedirs(DATASET_DIR, exist_ok=True) # Ensure directory exists
# Define file paths inside dataset folder
INDEX_FILE = os.path.join(DATASET_DIR, "faiss_index.bin") # FAISS index file
METADATA_FILE = os.path.join(DATASET_DIR, "metadata.json") # Metadata file
embedding_dim = 768  # Must match the embedding model's output dimension (768 for m2-bert-80M-8k-retrieval)
# Debugging: Check working directory and available files
print("Current working directory:", os.getcwd())
print("Files in dataset directory:", os.listdir(DATASET_DIR))
# Load FAISS index if it exists
if os.path.exists(INDEX_FILE):
    print("FAISS index file exists")
    index = faiss.read_index(INDEX_FILE)
else:
    print("No FAISS index found. Creating a new one.")
    index = faiss.IndexFlatL2(embedding_dim)  # Empty FAISS index
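# IndexFlatL2 performs exact (brute-force) L2-distance search, which is adequate for a small document store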
# Load metadata
if os.path.exists(METADATA_FILE):
print(" Metadata file exists")
with open(METADATA_FILE, "r") as f:
metadata = json.load(f)
else:
metadata = {}
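# metadata maps each FAISS row index (stored as a string key) to the saved document's file path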
def store_document(text):
print(" Storing document...")
# Generate a unique filename inside the dataset folder
doc_id = len(metadata) + 1
filename = os.path.join(DATASET_DIR, f"doc_{doc_id}.txt")
print(f"Saving document at: {filename}")
# Save document to file
with open(filename, "w", encoding="utf-8") as f:
f.write(text)
print(" Document saved")
# Generate and store embedding
embedding = embedding_model.encode([text]).astype(np.float32)
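    # encode() on a list returns a 2-D array of shape (1, embedding_dim); FAISS expects float32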
index.add(embedding) # Add to FAISS index
print(" Embeddings generated")
# Get FAISS index for the new document
doc_index = index.ntotal - 1
# Update metadata with FAISS index
metadata[str(doc_index)] = filename
with open(METADATA_FILE, "w") as f:
json.dump(metadata, f)
print(" Saved Metadata")
# Save FAISS index
faiss.write_index(index, INDEX_FILE)
print(" FAISS index saved")
return f"Document stored at: {filename}"
def retrieve_document(query):
print(f"Retrieving document based on:\n{query}")
# Generate query embedding
query_embedding = embedding_model.encode([query]).astype(np.float32)
# Search for the closest document in FAISS index
_, closest_idx = index.search(query_embedding, 1)
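    # index.search returns (distances, indices); an index of -1 means no neighbour was found (e.g. empty index)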
# Check if a relevant document was found
if closest_idx[0][0] == -1 or str(closest_idx[0][0]) not in metadata:
print("No relevant document found")
return None
# Retrieve the document file path
filename = metadata[str(closest_idx[0][0])]
# Read and return the document content
with open(filename, "r", encoding="utf-8") as f:
return f.read()
def clean_text(text):
"""Cleans extracted text for better processing by the model."""
print("cleaning")
text = unicodedata.normalize("NFKC", text) # Normalize Unicode characters
text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces and newlines
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\"()\-]', ' ', text)  # Keep only ASCII letters, digits, and basic punctuation
text = re.sub(r'(?i)(page\s*\d+)', '', text) # Remove page numbers
return text
def extract_text_from_pdf(pdf_file):
"""Extract and clean text from the uploaded PDF."""
print("extracting")
try:
with pdfplumber.open(pdf_file) as pdf:
text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
store_document(text)
return text
except Exception as e:
print(f"Error extracting text: {e}")
return None
def split_text(text, chunk_size=500):
"""Splits text into smaller chunks for better processing."""
print("splitting")
return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
def chatbot(pdf_file, user_question):
"""Processes the PDF and answers the user's question."""
print("chatbot start")
if pdf_file:
# Extract text from the PDF
text = extract_text_from_pdf(pdf_file)
if not text:
return "Could not extract any text from the PDF."
# retrieve the document relevant to the query
doc = retrieve_document(user_question)
if doc:
print(f"found doc{doc}")
# Split into smaller chunks
chunks = split_text(doc)
# Use only the first chunk (to optimize token usage)
prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
print(f"prompt: \n{prompt}")
else:
        prompt = user_question
    try:
        print("asking")
        # Send the prompt to Together.AI (Mistral-7B-Instruct)
        response = together.Completion.create(
model="mistralai/Mistral-7B-Instruct-v0.1",
prompt=prompt,
max_tokens=200,
temperature=0.7,
)
# Return chatbot's response
return response.choices[0].text
except Exception as e:
return f"Error generating response: {e}"
# Gradio Interface
iface = gr.Interface(
fn=chatbot,
inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Ask a Question")],
outputs=gr.Textbox(label="Answer"),
title="PDF Q&A Chatbot (Powered by Together.AI)"
)
# Launch Gradio app
iface.launch()
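# To run locally (assuming this file is saved as app.py and TOGETHER_API_KEY is set in the environment or a .env file):
#   python app.py
# Gradio prints the local URL (typically http://127.0.0.1:7860) on startup.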