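"""Gradio chatbot that answers questions about documents packed in a ZIP file.

The app extracts PDF, TXT, and CSV files from an uploaded ZIP archive, analyzes
and indexes their text in a FAISS vector store built with OpenAI embeddings, and
answers questions with gpt-3.5-turbo in the language of the question.
"""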
import os
import zipfile
from io import BytesIO

import gradio as gr
import openai
import pandas as pd
import PyPDF2
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
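
# NOTE: the openai.ChatCompletion calls below require the legacy openai<1.0 SDK
# (openai>=1.0 removed that interface), and the langchain imports assume the
# langchain + langchain-community package split.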

def detect_language(text):
    """Detects the language of the input text using OpenAI."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Detect the language of this text."},
            {"role": "user", "content": text},
        ],
    )
    return response["choices"][0]["message"]["content"].strip()

# Set up the OpenAI API key (read from the environment, or replace the placeholder)
openai.api_key = os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

def extract_files_from_zip(zip_path):
    """Extracts PDF, TXT, and CSV files from a ZIP archive, including subfolders."""
    extracted_files = {"pdf": [], "txt": [], "csv": []}
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if file_name.endswith(('.pdf', '.txt', '.csv')):
                with zip_ref.open(file_name) as file:
                    content = file.read()
                if file_name.endswith(".pdf"):
                    extracted_files["pdf"].append(BytesIO(content))
                elif file_name.endswith(".txt"):
                    extracted_files["txt"].append(BytesIO(content))
                elif file_name.endswith(".csv"):
                    extracted_files["csv"].append(BytesIO(content))
    return extracted_files

def analyze_text(text):
    """Uses OpenAI to analyze notes, links, and complementary information in the text."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Analyze this document and extract key points, links, and complementary information."},
            {"role": "user", "content": text},
        ],
    )
    return response["choices"][0]["message"]["content"].strip()

def get_text_from_pdf(pdf_files):
    """Concatenates the text of every page from the given PDF file-like objects."""
    text = ""
    for pdf in pdf_files:
        reader = PyPDF2.PdfReader(pdf)
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            text += (page.extract_text() or "") + "\n"
    return text

def get_text_from_txt(txt_files):
    """Reads and concatenates the contents of the given TXT file-like objects."""
    text = ""
    for txt in txt_files:
        text += txt.read().decode("utf-8", errors="replace") + "\n"
    return text

def get_text_from_csv(csv_files):
    """Renders each CSV file as a plain-text table and concatenates the results."""
    text = ""
    for csv in csv_files:
        df = pd.read_csv(csv)
        text += df.to_string() + "\n"
    return text

def create_vector_database(text):
    """Splits the text into overlapping chunks and indexes them in a FAISS vector store."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = splitter.split_text(text)
    embeddings = OpenAIEmbeddings()
    vector_db = FAISS.from_texts(texts, embeddings)
    return vector_db

def get_answer(question, vector_db, analysis):
    """Answers the question from the retrieved document chunks, in the question's language."""
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(question)
    if not docs:
        return "I could not find the answer in the documents. Do you want me to search external sources?"
    context = "\n".join([doc.page_content for doc in docs])
    language = detect_language(question)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents and their analyses to answer questions."},
            {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nAdditional insights:\n" + analysis},
        ],
    )
    return response["choices"][0]["message"]["content"]

def chatbot_interface(zip_file_path, question):
    """Gradio handler: extracts the ZIP, builds the index, and answers the question."""
    if not zip_file_path:
        return "Please upload a ZIP file before asking a question."
    extracted_files = extract_files_from_zip(zip_file_path)
    text = (
        get_text_from_pdf(extracted_files["pdf"])
        + get_text_from_txt(extracted_files["txt"])
        + get_text_from_csv(extracted_files["csv"])
    )
    if not text.strip():
        return "The ZIP file does not contain valid PDF, TXT, or CSV files. Please upload supported file types."
    analysis = analyze_text(text)
    vector_db = create_vector_database(text)
    return get_answer(question, vector_db, analysis)

# Gradio interface
demo = gr.Interface(
    fn=chatbot_interface,
    inputs=[
        gr.File(label="Upload ZIP File"),
        gr.Textbox(label="Ask a question", placeholder="Type your question here..."),
    ],
    outputs=gr.Textbox(label="Answer"),
)

demo.launch()