Chan-Y committed on
Commit
23a3f70
·
verified ·
1 Parent(s): eae7319

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from llama_index.core.readers import SimpleDirectoryReader
3
+ from llama_index.core import VectorStoreIndex, Document
4
+ from llama_index.core.node_parser import SentenceSplitter
5
+ from llama_index.core import Settings
6
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
7
+ from llama_index.llms.huggingface import HuggingFaceLLM
8
+ import csv
9
+ from docx import Document as DocxDocument
10
+ import fitz
11
+
12
# Supported LLMs: Hugging Face repo id -> human-readable label shown in the UI.
# The repo id (the key) is what must be handed to the model loader; the label
# (the value) is for display only.
lm_list = {
    # Gemma 2 with 9B parameters (the repo id is "gemma-2-9b", not "2.9B").
    "google/gemma-2-9b-it": "Google Gemma 2 9B IT",
    "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct v0.3",
}

# Module-level slot for a query engine; starts out empty.
query_engine = None
20
+
21
def process_file(file):
    """Extract the text of an uploaded file and wrap it in a llama-index Document.

    Supported formats are chosen by file extension: ``txt``, ``csv``, ``pdf``
    and ``docx``.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as ``.name`` (e.g. an upload wrapper).

    Returns:
        A single-element list containing a ``Document`` with the extracted text.

    Raises:
        ValueError: If ``file`` is ``None`` or its extension is not supported.
    """
    import os

    if file is None:
        raise ValueError("No file was provided.")

    # Path-like objects also have a ``.name`` attribute, but it is only the
    # base name, so they must be resolved with ``os.fspath`` instead.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    file_extension = path.split(".")[-1].lower()

    if file_extension == 'txt':
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

    elif file_extension == 'csv':
        # newline='' lets the csv module handle quoted embedded newlines.
        with open(path, 'r', encoding='utf-8', newline='') as f:
            text = '\n'.join(','.join(row) for row in csv.reader(f))

    elif file_extension == 'pdf':
        # The context manager closes the document even if extraction fails.
        with fitz.open(path, filetype=file_extension) as pdf_document:
            text = "".join(page.get_text("text") for page in pdf_document)

    elif file_extension == 'docx':
        docx_document = DocxDocument(path)
        text = "".join(
            paragraph.text + "\n" for paragraph in docx_document.paragraphs
        )

    else:
        # Previously fell through and failed with an unbound ``text``.
        raise ValueError(
            f"Unsupported file type '.{file_extension}'. "
            "Supported types: txt, csv, pdf, docx."
        )

    return [Document(text=text)]
48
+
49
def handle_file_upload(file, llm_name):
    """Build a query engine over an uploaded document using the chosen LLM.

    Configures the global llama-index ``Settings`` (LLM, embedding model and
    text splitter), indexes the document and returns a query engine for it.

    Args:
        file: The uploaded file, forwarded unchanged to ``process_file``.
        llm_name: Either a Hugging Face repo id or one of the display labels
            stored as values in ``lm_list``.

    Returns:
        The query engine produced by ``VectorStoreIndex.as_query_engine()``.

    Raises:
        ValueError: If no model name was supplied.
    """
    if not llm_name:
        raise ValueError("No language model selected.")

    # The UI may hand over the display label rather than the repo id, so
    # translate labels back to ids; anything unknown is used as given.
    label_to_id = {label: model_id for model_id, label in lm_list.items()}
    model_id = label_to_id.get(llm_name, llm_name)

    # Set the tokenizer explicitly so it always matches the selected model
    # rather than relying on the loader's default tokenizer.
    Settings.llm = HuggingFaceLLM(model_name=model_id, tokenizer_name=model_id)

    documents = process_file(file)

    text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)
    # "nomic-embed-text:latest" is an Ollama tag, not a Hugging Face repo id.
    # NOTE(review): this is the matching Hub checkpoint; it ships custom model
    # code, hence trust_remote_code=True — confirm this is acceptable.
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="nomic-ai/nomic-embed-text-v1.5",
        trust_remote_code=True,
    )
    Settings.text_splitter = text_splitter
    index = VectorStoreIndex.from_documents(
        documents, transformations=[text_splitter], embed_model=Settings.embed_model
    )

    return index.as_query_engine()
64
+
65
def document_qa(file_upload, llm_choice, question_input):
    """Answer a question about an uploaded document.

    Args:
        file_upload: The uploaded file, or ``None`` if nothing was uploaded.
        llm_choice: The model selected by the user.
        question_input: The user's question.

    Returns:
        The answer as a string, or a short prompt asking for the missing
        input when the file, model or question is absent.
    """
    # Guard clauses: report missing inputs instead of crashing in the pipeline.
    if file_upload is None:
        return "Please upload a document first."
    if not llm_choice:
        return "Please choose a language model."
    if not question_input or not question_input.strip():
        return "Please enter a question."

    # Use a local name so the module-level ``query_engine`` is not shadowed.
    engine = handle_file_upload(file_upload, llm_choice)
    result = engine.query(question_input)
    return str(result)
69
+
70
+
71
# Show the friendly label in the dropdown but submit the Hugging Face repo id,
# which is what the model loader needs.
llm_choice = gr.Dropdown(
    choices=[(label, model_id) for model_id, label in lm_list.items()],
    label="Choose LLM",
)
file_upload = gr.File(label="Upload Document")
question_input = gr.Textbox(label="Enter your question")

gr.Interface(
    fn=document_qa,
    inputs=[file_upload, llm_choice, question_input],
    outputs=gr.Textbox(label="Answer"),
    title="Document Question Answering",
    description="Upload a document and choose a language model to get answers.",
    # Gradio expects "never" / "auto" / "manual"; the boolean form is deprecated.
    allow_flagging="never",
).launch()