import uuid

import gradio as gr
import chromadb
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, AutoTokenizer
def get_context(query_text):
    # Embed the question and retrieve the closest chunks from the global
    # Chroma collection built in upload_pdf().
    query_emb = st_model.encode(query_text)
    query_response = collection.query(query_embeddings=[query_emb.tolist()], n_results=4)
    # Chroma nests results per query embedding; only the top-ranked chunk is
    # used here, which keeps the prompt short for the small T5 model.
    context = query_response['documents'][0][0]
    # Collapse newlines and repeated spaces.
    context = " ".join(context.split())
    return context
def local_query(query, context):
    # Build a simple instruction prompt for the flan-t5 model.
    t5query = """Using the available context, please answer the question.
If you aren't sure, please say "I don't know".
Context: {}
Question: {}
""".format(context, query)
    # Move the tokenized prompt to the model's device (a no-op on CPU-only setups).
    inputs = tokenizer(t5query, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=20)
    # batch_decode returns one string per sequence; return the single answer.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
def run_query(query):
    context = get_context(query)
    result = local_query(query, context)
    return result
def upload_pdf(file):
    global collection

    # Gradio provides the uploaded file as a temporary file on disk.
    pdf_path = file.name

    # Load the PDF and split it into ~1000-character chunks.
    loader = PDFMinerLoader(pdf_path)
    doc = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(doc)
    texts = [t.page_content for t in texts]

    # Embed the chunks and store them in an in-memory Chroma collection.
    doc_emb = st_model.encode(texts)
    doc_emb = doc_emb.tolist()
    ids = [str(uuid.uuid1()) for _ in doc_emb]

    client = chromadb.Client()
    # get_or_create_collection avoids an error if a PDF was already uploaded.
    collection = client.get_or_create_collection("test_db")
    collection.add(
        embeddings=doc_emb,
        documents=texts,
        ids=ids
    )

    return run_query("how to reduce waste?")
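# Note (assumption): the chromadb.Client() used above keeps the index in memory only,
# so it is lost when the app restarts. Depending on the installed chromadb version, an
# on-disk store is available instead, e.g. chromadb.PersistentClient(path="chroma_db")
# in recent releases, or a Settings(persist_directory=...) client in older ones.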
# Load the flan-t5 generator and its tokenizer (used by local_query above).
model_name = 'google/flan-t5-base'
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map='auto', offload_folder="offload")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the sentence-embedding model used for both document chunks and queries.
ST_name = 'sentence-transformers/sentence-t5-base'
st_model = SentenceTransformer(ST_name)
iface = gr.Interface(
    fn=upload_pdf,
    inputs="file",
    outputs="text",
    title="PDF Question Answering",
    description="Upload a PDF; it is split into chunks, embedded into a Chroma collection, and a sample question is answered with flan-t5-base.",
)

iface.launch()
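# A minimal smoke test, kept commented out so it does not interfere with the Gradio app.
# It indexes a local PDF and asks a question directly, bypassing the UI. "example.pdf"
# is a placeholder path; SimpleNamespace only mimics the .name attribute of the object
# Gradio passes to upload_pdf().
#
# from types import SimpleNamespace
# print(upload_pdf(SimpleNamespace(name="example.pdf")))
# print(run_query("What does the document recommend?"))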