import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
import zipfile
from io import BytesIO
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
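# Note (assumed library versions, not pinned by this file): the calls below rely on the
# pre-1.0 `openai` package (openai.ChatCompletion), PyPDF2 with the PdfReader API, and a
# LangChain release that ships the `langchain_community` split; a FAISS build such as
# faiss-cpu must also be installed for the vector store.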
def detect_language(text):
    """Detects the language of the input text using OpenAI."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Detect the language of this text."},
            {"role": "user", "content": text}
        ]
    )
    return response["choices"][0]["message"]["content"].strip()
# Set up the OpenAI API key (replace with your key or set the OPENAI_API_KEY environment variable)
openai.api_key = os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = openai.api_key  # OpenAIEmbeddings reads the key from the environment
def extract_files_from_zip(zip_file):
    """Extracts PDF, TXT, and CSV files from a ZIP archive."""
    extracted_files = {"pdf": [], "txt": [], "csv": []}
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            with zip_ref.open(file_name) as file:
                if file_name.endswith(".pdf"):
                    extracted_files["pdf"].append(BytesIO(file.read()))
                elif file_name.endswith(".txt"):
                    extracted_files["txt"].append(BytesIO(file.read()))
                elif file_name.endswith(".csv"):
                    extracted_files["csv"].append(BytesIO(file.read()))
    return extracted_files
def get_text_from_pdf(pdf_files):
    """Concatenates the text of every page in each PDF file."""
    text = ""
    for pdf in pdf_files:
        reader = PyPDF2.PdfReader(pdf)
        for page in reader.pages:
            text += (page.extract_text() or "") + "\n"
    return text
def get_text_from_txt(txt_files):
    """Concatenates the contents of each plain-text file."""
    text = ""
    for txt in txt_files:
        text += txt.read().decode("utf-8") + "\n"
    return text
def get_text_from_csv(csv_files):
    """Renders each CSV as a plain-text table and concatenates the results."""
    text = ""
    for csv in csv_files:
        df = pd.read_csv(csv)
        text += df.to_string() + "\n"
    return text
def create_vector_database(text):
    """Splits the text into overlapping chunks and indexes them in a FAISS vector store."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = splitter.split_text(text)
    embeddings = OpenAIEmbeddings()
    vector_db = FAISS.from_texts(texts, embeddings)
    return vector_db
def get_answer(question, vector_db):
    """Retrieves relevant chunks and asks the chat model to answer in the question's language."""
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(question)
    if not docs:
        return "I could not find the answer in the documents. Do you want me to search external sources?"
    context = "\n".join([doc.page_content for doc in docs])
    language = detect_language(question)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions."},
            {"role": "user", "content": question + "\n\nBased on the following context:\n" + context}
        ]
    )
    return response["choices"][0]["message"]["content"]
def chatbot_interface(zip_file, question):
    """Gradio handler: extracts text from the uploaded ZIP, builds the index, and answers the question."""
    text = ""
    if zip_file:
        extracted_files = extract_files_from_zip(zip_file)
        text += get_text_from_pdf(extracted_files["pdf"])
        text += get_text_from_txt(extracted_files["txt"])
        text += get_text_from_csv(extracted_files["csv"])
    if not text:
        return "Please upload a ZIP file containing PDFs, TXTs, or CSVs before asking questions."
    vector_db = create_vector_database(text)
    return get_answer(question, vector_db)
# Gradio interface
demo = gr.Interface(
    fn=chatbot_interface,
    inputs=[gr.File(file_types=[".zip"]),
            gr.Textbox(placeholder="Type your question here...")],
    outputs=gr.Textbox()
)
demo.launch()
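# A minimal requirements.txt for this Space would look roughly like the list below
# (package names follow the imports above; the version bound on openai is an assumption
# needed for the openai.ChatCompletion calls):
#   gradio
#   openai<1.0
#   PyPDF2
#   pandas
#   langchain
#   langchain-community
#   faiss-cpu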