# Web file-viewer header captured with the source (not part of the program):
# Rafa1986's picture · "Update app.py" · fbe2154 (verified) · raw · history · blame · 3.74 kB
import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
import zipfile
from io import BytesIO
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import OpenAI
def detect_language(text):
    """Ask the OpenAI chat model which language ``text`` is written in.

    ``text`` is sent as the user message to ``gpt-3.5-turbo`` together with a
    system prompt requesting language detection. The content of the first
    returned choice is returned with surrounding whitespace stripped.
    """
    # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 client
    # interface -- confirm against the openai version this app is pinned to.
    conversation = [
        {"role": "system", "content": "Detect the language of this text."},
        {"role": "user", "content": text},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()
# Set up the OpenAI API key. It is read from the OPENAI_API_KEY environment
# variable so the secret does not have to be committed with the source; when
# the variable is unset the original placeholder value is used unchanged.
openai.api_key = os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
def extract_files_from_zip(zip_file):
    """Extract PDF, TXT, and CSV files from a ZIP archive.

    Args:
        zip_file: A path or file-like object accepted by ``zipfile.ZipFile``.

    Returns:
        A dict with the keys ``"pdf"``, ``"txt"`` and ``"csv"``. Each value is
        a list of ``BytesIO`` buffers holding the bytes of the archive members
        whose name ends with that extension, in archive order. Members with
        any other extension are ignored.
    """
    extracted_files = {"pdf": [], "txt": [], "csv": []}
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        for file_name in zip_ref.namelist():
            # Compare case-insensitively so names such as "REPORT.PDF" are
            # not silently dropped.
            lowered_name = file_name.lower()
            for kind, buffers in extracted_files.items():
                if lowered_name.endswith("." + kind):
                    # Only members that are kept are actually read.
                    buffers.append(BytesIO(zip_ref.read(file_name)))
                    break
    return extracted_files
def get_text_from_pdf(pdf_files):
    """Concatenate the extracted text of every page of every given PDF.

    Each item of ``pdf_files`` is handed to ``PyPDF2.PdfReader``. The text of
    each page is followed by a newline, and the pieces are joined in reading
    order. An empty ``pdf_files`` yields an empty string.
    """
    page_texts = []
    for pdf_stream in pdf_files:
        document = PyPDF2.PdfReader(pdf_stream)
        page_texts.extend(page.extract_text() + "\n" for page in document.pages)
    return "".join(page_texts)
def get_text_from_txt(txt_files):
    """Return the decoded contents of each text file, newline-terminated.

    Args:
        txt_files: Iterable of binary file-like objects whose ``read()``
            returns ``bytes``.

    Returns:
        The contents of all files concatenated in order, each followed by a
        newline. An empty iterable yields an empty string.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
    # mark; errors="replace" keeps one badly encoded file from aborting the
    # whole request with UnicodeDecodeError.
    return "".join(
        txt.read().decode("utf-8-sig", errors="replace") + "\n"
        for txt in txt_files
    )
def get_text_from_csv(csv_files):
    """Render every CSV file as a text table and concatenate the results.

    Each item of ``csv_files`` is parsed with ``pandas.read_csv`` and rendered
    with ``DataFrame.to_string``; every rendered table is followed by a
    newline. An empty ``csv_files`` yields an empty string.
    """
    rendered_tables = [
        pd.read_csv(csv_source).to_string() + "\n" for csv_source in csv_files
    ]
    return "".join(rendered_tables)
def create_vector_database(text):
    """Split ``text`` into chunks and index them in a FAISS vector store.

    Args:
        text: The full document text to index.

    Returns:
        The store built by ``FAISS.from_texts`` from the chunks, embedded
        with ``OpenAIEmbeddings``.
    """
    # The splitter was used without ever being imported, which raised
    # NameError on the first call. Import it here, trying the standalone
    # splitter package first and the path inside ``langchain`` second.
    try:
        from langchain_text_splitters import RecursiveCharacterTextSplitter
    except ImportError:
        from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Chunks of up to 1000 characters with a 100-character overlap between
    # neighbouring chunks.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = splitter.split_text(text)
    embeddings = OpenAIEmbeddings()
    vector_db = FAISS.from_texts(texts, embeddings)
    return vector_db
def get_answer(question, vector_db):
    """Answer ``question`` from the documents retrieved out of ``vector_db``.

    The documents returned by the store's retriever are joined with newlines
    to form the context. When the retriever returns nothing, a fixed
    "could not find" message is returned without calling the chat model.
    Otherwise the language reported by ``detect_language`` is placed into the
    system prompt, and the content of the model's first choice is returned.
    """
    matches = vector_db.as_retriever().get_relevant_documents(question)
    if not matches:
        return "I could not find the answer in the documents. Do you want me to search external sources?"

    context = "\n".join(match.page_content for match in matches)
    language = detect_language(question)

    system_prompt = f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions."
    user_prompt = question + "\n\nBased on the following context:\n" + context

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return completion["choices"][0]["message"]["content"]
def chatbot_interface(zip_file, question):
    """Gradio callback: answer ``question`` from the files in ``zip_file``.

    When ``zip_file`` is truthy, its PDF, TXT and CSV members are extracted
    and their text is concatenated in that order. If no text results (no
    upload, or nothing extractable), a prompt asking for a ZIP upload is
    returned. Otherwise a vector database is built from the text and the
    result of ``get_answer`` is returned.
    """
    pieces = []
    if zip_file:
        archive = extract_files_from_zip(zip_file)
        pieces.append(get_text_from_pdf(archive["pdf"]))
        pieces.append(get_text_from_txt(archive["txt"]))
        pieces.append(get_text_from_csv(archive["csv"]))

    corpus = "".join(pieces)
    if not corpus:
        return "Please upload a ZIP file containing PDFs, TXTs, or CSVs before asking questions."

    return get_answer(question, create_vector_database(corpus))
# Gradio interface: a ZIP upload and a question box feed chatbot_interface,
# whose return value is shown in a text box.
zip_upload = gr.File(file_types=[".zip"])
question_box = gr.Textbox(placeholder="Type your question here...")
answer_box = gr.Textbox()

demo = gr.Interface(
    fn=chatbot_interface,
    inputs=[zip_upload, question_box],
    outputs=answer_box,
)
demo.launch()