# Document Question Answering — Gradio app backed by LlamaIndex.
# Standard library
import csv

# Third-party
import fitz  # PyMuPDF
import gradio as gr
from docx import Document as DocxDocument
from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
# Map HuggingFace model ids -> human-readable display names for the UI.
lm_list = {
    # Label fixed: google/gemma-2-9b-it is the 9B Gemma 2 model, not "2.9B".
    "google/gemma-2-9b-it": "Google Gemma 2 9B IT",
    "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct v0.3",
}

# Module-level query engine; (re)built on each upload by handle_file_upload().
query_engine = None
def process_file(file):
    """Extract the text of an uploaded file and wrap it in a Document.

    Supports .txt, .csv, .pdf and .docx uploads. ``file`` is a Gradio file
    object; only its ``.name`` attribute (a filesystem path) is used.

    Returns:
        A single-element list ``[Document(text=...)]``.

    Raises:
        ValueError: for any other extension (the original fell through and
            crashed with an UnboundLocalError on ``text``).
    """
    file_extension = file.name.split(".")[-1].lower()
    if file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    elif file_extension == 'csv':
        # Re-join rows/cells so the document reads like the original CSV.
        with open(file.name, 'r', encoding='utf-8') as f:
            text = '\n'.join(','.join(row) for row in csv.reader(f))
    elif file_extension == 'pdf':
        text = ""
        pdf_document = fitz.open(file.name, filetype=file_extension)
        try:
            for page_num in range(pdf_document.page_count):
                text += pdf_document.load_page(page_num).get_text("text")
        finally:
            # Always release the PDF handle, even if extraction raises.
            pdf_document.close()
    elif file_extension == 'docx':
        text = "".join(
            paragraph.text + "\n" for paragraph in DocxDocument(file.name).paragraphs
        )
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
    return [Document(text=text)]
def handle_file_upload(file, llm_name):
    """Build and return a fresh query engine over the uploaded document.

    ``llm_name`` may be either a display label from ``lm_list`` (what the
    Gradio dropdown submits) or a raw HuggingFace model id; labels are
    reverse-mapped to their id because ``HuggingFaceLLM`` needs the repo id,
    not the friendly name.
    """
    global query_engine
    # Reverse-map a display label to its model id; pass ids through as-is.
    model_id = next(
        (mid for mid, label in lm_list.items() if label == llm_name), llm_name
    )
    Settings.llm = HuggingFaceLLM(model_name=model_id)
    documents = process_file(file)
    text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)
    # NOTE(review): "nomic-embed-text:latest" is an Ollama-style tag, not a
    # HuggingFace repo id — confirm (e.g. "nomic-ai/nomic-embed-text-v1").
    Settings.embed_model = HuggingFaceEmbedding(model_name="nomic-embed-text:latest")
    Settings.text_splitter = text_splitter
    index = VectorStoreIndex.from_documents(
        documents, transformations=[text_splitter], embed_model=Settings.embed_model
    )
    # Keep the module-level engine in sync: the original declared `global`
    # but never assigned it, leaving `query_engine` permanently None.
    query_engine = index.as_query_engine()
    return query_engine
def document_qa(file_upload, llm_choice, question_input):
    """Index the uploaded document with the chosen LLM and answer the question."""
    engine = handle_file_upload(file_upload, llm_choice)
    answer = engine.query(question_input)
    return str(answer)
# --- Gradio UI -------------------------------------------------------------
llm_choice = gr.Dropdown(
    # (label, value) pairs: show the friendly name, but submit the model id
    # that HuggingFaceLLM actually needs. The original submitted the label,
    # which is not a valid HuggingFace repo id.
    choices=[(label, model_id) for model_id, label in lm_list.items()],
    label="Choose LLM",
)
file_upload = gr.File(label="Upload Document")
question_input = gr.Textbox(label="Enter your question")

demo = gr.Interface(
    fn=document_qa,
    inputs=[file_upload, llm_choice, question_input],
    outputs=gr.Textbox(label="Answer"),
    title="Document Question Answering",
    description="Upload a document and choose a language model to get answers.",
    # allow_flagging expects "never"/"manual"/"auto"; boolean False is invalid.
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()