hancav committed on
Commit
53431b4
·
1 Parent(s): 0e797d5

create app.py

Browse files
Files changed (1) hide show
  1. app.py +216 -0
app.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """## Import necessary libraries"""
2
+ import os
3
+ import shutil
4
+ import json
5
+ from langchain.document_loaders import PyPDFLoader
6
+ from langchain.document_loaders import PyPDFDirectoryLoader
7
+ from langchain.llms import OpenAI
8
+ from langchain.prompts import PromptTemplate
9
+ from langchain.chains import LLMChain
10
+ from langchain.output_parsers import PydanticOutputParser
11
+ from pydantic import BaseModel, Field
12
+ from langchain.document_loaders import YoutubeLoader
13
+ from langchain.document_loaders import WebBaseLoader
14
+ from langchain.text_splitter import CharacterTextSplitter
15
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
16
+ from langchain.embeddings.openai import OpenAIEmbeddings
17
+ from langchain.vectorstores import Chroma
18
+ from langchain.chat_models import ChatOpenAI
19
+ from langchain.chains import RetrievalQA
20
+ #from google.colab import drive
21
+ from google.oauth2 import service_account
22
+ from google.cloud import translate_v2 as translate
23
+ import gradio as gr
24
+
25
+ """## Access KEY"""
26
+ #ACCESS_KEY = os.environ.get("ACCESS_KEY")
27
# Google Cloud service-account credentials. The JSON document is read from the
# SERVICE_ACCOUNT_FILE environment variable (the variable holds the JSON text
# itself, since it is passed straight to json.loads).
_service_account_json = os.environ.get("SERVICE_ACCOUNT_FILE")
if not _service_account_json:
    # json.loads(None) would otherwise fail with an opaque TypeError.
    raise RuntimeError(
        "Environment variable SERVICE_ACCOUNT_FILE is not set; it must contain "
        "the Google service-account JSON."
    )
service_account_info = json.loads(_service_account_json)
credentials = service_account.Credentials.from_service_account_info(service_account_info)
29
+
30
+ """ ## Load PDF """
31
class LoadPdf:
    """Hold the path of one PDF file and load it through ``PyPDFLoader``."""

    def __init__(self, pdf_file):
        """Store ``pdf_file`` once its extension has been checked.

        Raises:
            gr.Error: when the extension is not ``.pdf`` (case-insensitive).
        """
        if self.is_pdf_file(pdf_file):
            self.pdf_file = pdf_file
        else:
            raise gr.Error("Invalid file extension. Please load a PDF file")

    def is_pdf_file(self, file_path):
        """Return True when ``file_path`` has a ``.pdf`` extension, ignoring case."""
        extension = os.path.splitext(file_path)[1]
        return extension.lower() == ".pdf"

    def read_file(self):
        """Return the result of ``PyPDFLoader(self.pdf_file).load()``."""
        return PyPDFLoader(self.pdf_file).load()
46
+
47
+ """## Request OpenAI"""
48
class QuestionAnswer:
    """Run one question against a set of loaded documents via ``RetrievalQA``."""

    def __init__(self, data, question, user_key):
        """Keep the documents, the question and the OpenAI API key."""
        self.data, self.question, self.key = data, question, user_key

    def make_qa(self):
        """Split, embed and index ``self.data``, then ask the question.

        Returns:
            Whatever ``RetrievalQA.run`` returns for the question.
        """
        # Split the documents (chunk_size=1000, chunk_overlap=100).
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        chunks = splitter.split_documents(self.data)

        # Embed the chunks and index them in a Chroma store persisted on disk.
        # NOTE(review): the same persist directory is reused on every call, so
        # chunks from earlier calls may still be in the index — confirm intended.
        openai_embeddings = OpenAIEmbeddings(openai_api_key=self.key)
        vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=openai_embeddings,
            persist_directory='files/chroma/',
        )
        retriever = vector_store.as_retriever()

        # Chat model used by the retrieval chain.
        chat_model = ChatOpenAI(
            temperature=0.2,
            model="gpt-3.5-turbo-16k",
            openai_api_key=self.key,
        )
        chain = RetrievalQA.from_chain_type(llm=chat_model, retriever=retriever)

        # The f-string coerces the question to ``str`` before it is submitted.
        return chain.run(f'{self.question}')
74
+
75
+ """## Translation"""
76
class TranslateOutput:
    """Thin wrapper around the Google Cloud Translate v2 client."""

    def __init__(self, credentials):
        """Remember the credentials passed to every client this class creates."""
        self.credentials = credentials

    def list_languages(self):
        """Return the ``name`` field of every entry from ``get_languages()``."""
        api = translate.client.Client(credentials=self.credentials)
        return [entry['name'] for entry in api.get_languages()]

    def all_languages(self):
        """Return the unmodified result of the client's ``get_languages()``."""
        api = translate.client.Client(credentials=self.credentials)
        return api.get_languages()

    def translate_text(self, text, target_language):
        """Translate ``text`` into ``target_language``.

        ``bytes`` input is decoded as UTF-8 first. Returns the
        ``"translatedText"`` entry of the client's response.
        """
        api = translate.client.Client(
            target_language=target_language,
            credentials=self.credentials,
        )
        payload = text.decode("utf-8") if isinstance(text, bytes) else text
        response = api.translate(payload, target_language=target_language)
        return response["translatedText"]
100
+
101
+ """## Run QA """
102
def run_qa(files, checkboxes, question, language, user_key):
    """Answer ``question`` over the selected PDFs and translate the answer.

    Args:
        files: Uploaded file objects (each exposing ``.name``), or ``None``.
        checkboxes: Base names of the files the user selected, or ``None``.
        question: Question text forwarded to the QA chain.
        language: Display name of the target language; matched against the
            ``name`` field of the Translate language list.
        user_key: OpenAI API key entered by the user.

    Returns:
        The translated answer. If ``language`` is not found in the language
        list, the untranslated answer is returned. If the API key is missing
        or no selected file is available, a short Portuguese message is
        returned instead.
    """
    # An empty textbox yields "" rather than None, so test for emptiness.
    api_key = (user_key or "").strip()
    if not api_key:
        return 'Introduza OpenAI API KEY'

    uploaded = files or []
    chosen_files = checkboxes or []
    available_files = [os.path.basename(uploaded_file.name) for uploaded_file in uploaded]

    # Keep only the files that are both uploaded and selected.
    loadable_files = [name for name in available_files if name in chosen_files]

    # debug messages
    print(f"=> Available Files: {str(available_files)}")
    print(f"=> Chosen Files: {str(chosen_files)}")
    print(f"=> Question for Files: {str(question)}")
    print(f"=> Language to use: {str(language)}")

    # Nothing to query: stop before handing an empty input to the QA chain.
    if not loadable_files:
        return 'Selecione pelo menos um ficheiro PDF'

    # Accumulate the documents of every selected file; assigning inside the
    # loop would keep only the last file.
    data = []
    for name in loadable_files:
        print(f"=> Loading chosen file: {str(name)}")
        data.extend(LoadPdf("pdfs/" + name).read_file())

    # Run the model
    answer_open_ai = QuestionAnswer(data, question, api_key).make_qa()

    # Resolve the language display name to its code; None when not listed.
    translate_output = TranslateOutput(credentials)
    iso_code = next(
        (
            entry['language']
            for entry in translate_output.all_languages()
            if entry['name'] == language
        ),
        None,
    )

    print(f"=> Answer OpenAI: {answer_open_ai}")
    print(f"=> Target Language IsoCode: {iso_code}")

    # Unknown language: return the answer untranslated rather than failing.
    if iso_code is None:
        return answer_open_ai

    answer = translate_output.translate_text(answer_open_ai, target_language=iso_code)
    print(f"=> Translated Answer OpenAI: {answer}")

    return answer
149
+
150
+ # Define a function to be called when files are uploaded
151
def on_files_upload(files):
    """Copy the uploaded files into ``pdfs/`` and list them in the checkbox group.

    Args:
        files: Uploaded file objects; each one's ``.name`` is used as the path
            to copy from. ``None`` is treated as no files.

    Returns:
        A ``gr.CheckboxGroup`` whose choices are the uploaded files' base names.
    """
    files_dir = "pdfs"
    # exist_ok=True already tolerates an existing directory.
    os.makedirs(files_dir, exist_ok=True)

    filenames = []
    for fileobj in files or []:
        # Take the base name from ``.name``, the same attribute used as the
        # copy source, instead of from the file object itself.
        filename = os.path.basename(fileobj.name)
        shutil.copyfile(fileobj.name, os.path.join(files_dir, filename))
        filenames.append(filename)

    # checkbox group update
    return gr.CheckboxGroup(choices=filenames)
164
+
165
+ # Define a function to be called when files are cleared
166
def on_files_cleared():
    """Remove the ``pdfs`` directory, if present, and empty the checkbox group."""
    upload_dir = "pdfs"
    if os.path.exists(upload_dir):
        shutil.rmtree(upload_dir)
    return gr.CheckboxGroup(choices=[])
171
+
172
+ # Define the Gradio interface
173
# Static page header: title, subtitle and authors rendered as HTML.
title = "Deep Learning - Natural Language Processing"
subtitle = "Questão e Resposta assistida por LLMs sobre documentos PDF"
authors = "Hugo Cavalaria | Nuno Seiça | Ricardo Neves | Wilton Nagase"
custom_layout = f"<h1>{title}</h1><h2>{subtitle}</h2><p>{authors}</p>"
177
+
178
# Fetch the language display names offered in the translation dropdown.
translate_output = TranslateOutput(credentials)
language_names = list(translate_output.list_languages())
181
+
182
+ # Gradio Interface
183
# Gradio UI: a header row, then an input column (upload, file selection,
# question, language, API key) next to an output column, plus the ask button.
# NOTE(review): the nesting was reconstructed from a flattened source; the
# button is placed at Blocks level below the rows — confirm against the app.
with gr.Blocks() as interface:
    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML(custom_layout)

    with gr.Row():
        with gr.Column(scale=1):
            upload_pdfs = gr.Files(
                label="Fazer upload de ficheiros PDF",
                interactive=True,
                file_types=['.pdf'],
                container=True,
            )
            checkbox_group = gr.CheckboxGroup(
                label="Selecionar os ficheiros a utilizar.",
                choices=[],
                interactive=True,
            )
            question_text = gr.Textbox(label="Pergunta:")
            answer_language = gr.Dropdown(
                label="Selecionar uma linguagem para tradução da resposta.",
                choices=language_names,
                value="Portuguese",
            )
            # Mask the API key so the secret is not shown in clear text.
            secret_key = gr.Textbox(label="OpenAI API Key:", type="password")
        with gr.Column(scale=1):
            output_status = gr.Textbox(label="Resposta:")

    btn = gr.Button("Perguntar")

    # Ask the question over the selected files and show the answer.
    btn.click(
        fn=run_qa,
        inputs=[upload_pdfs, checkbox_group, question_text, answer_language, secret_key],
        outputs=[output_status],
    )

    # Refresh the selectable file list after an upload.
    upload_pdfs.upload(
        fn=on_files_upload,
        inputs=[upload_pdfs],
        outputs=[checkbox_group],
        show_progress="full",
    )

    # Empty the selectable file list when the upload widget is cleared.
    upload_pdfs.clear(
        fn=on_files_cleared,
        inputs=None,
        outputs=[checkbox_group],
    )
212
+
213
+ """## Launch Interface"""
214
# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch(debug=True, share=False)