File size: 3,738 Bytes
91c1e78
65881ce
 
 
 
fbe2154
 
0530099
 
 
91c1e78
65881ce
 
 
 
 
 
 
 
 
 
91c1e78
65881ce
 
91c1e78
fbe2154
 
 
 
 
 
 
 
 
 
 
 
 
 
65881ce
 
 
 
 
 
 
91c1e78
65881ce
 
 
 
 
91c1e78
65881ce
 
 
 
 
 
91c1e78
65881ce
 
 
 
 
 
91c1e78
65881ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91c1e78
fbe2154
65881ce
fbe2154
 
 
 
 
65881ce
 
fbe2154
65881ce
 
 
91c1e78
65881ce
 
 
fbe2154
65881ce
 
91c1e78
 
65881ce
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
import zipfile
from io import BytesIO
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import OpenAI

def detect_language(text):
    """Ask the OpenAI chat model which language ``text`` is written in.

    A single chat-completion request is sent to ``gpt-3.5-turbo`` with a
    fixed system instruction and ``text`` as the user message.

    Args:
        text: The text whose language should be identified.

    Returns:
        The content of the first returned choice, with leading and trailing
        whitespace stripped. The wording of the reply is whatever the model
        produces; no format is enforced here.
    """
    conversation = [
        {"role": "system", "content": "Detect the language of this text."},
        {"role": "user", "content": text},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()

# Set up the OpenAI API key. Prefer the OPENAI_API_KEY environment variable so
# a real key never has to be written into this file; the placeholder is only
# used when that variable is unset or empty (set the variable, or replace it).
openai.api_key = os.environ.get("OPENAI_API_KEY") or "YOUR_OPENAI_API_KEY"

def extract_files_from_zip(zip_file):
    """Collect the PDF, TXT, and CSV members of a ZIP archive in memory.

    Args:
        zip_file: Anything ``zipfile.ZipFile`` accepts for reading (a path or
            a seekable binary file-like object).

    Returns:
        A dict with the keys ``"pdf"``, ``"txt"`` and ``"csv"``; each value is
        a list of ``BytesIO`` objects holding the bytes of the matching
        members, in archive order.

    Raises:
        zipfile.BadZipFile: If ``zip_file`` is not a valid ZIP archive.
    """
    extracted_files = {"pdf": [], "txt": [], "csv": []}
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for member in zip_ref.infolist():
            file_name = member.filename
            # Directory entries carry no data, and macOS archives add
            # "__MACOSX/._name.pdf" metadata stubs that are not real documents.
            if member.is_dir() or "__MACOSX" in file_name.split("/"):
                continue
            # Match extensions case-insensitively so "REPORT.PDF" is not lost.
            lowered = file_name.lower()
            for kind, bucket in extracted_files.items():
                if lowered.endswith("." + kind):
                    # Only members that are actually kept get read.
                    bucket.append(BytesIO(zip_ref.read(member)))
                    break
    return extracted_files

def get_text_from_pdf(pdf_files):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_files: Iterable of sources that ``PyPDF2.PdfReader`` can open.

    Returns:
        One string with each page's extracted text followed by a newline,
        in file order and then page order. Empty when ``pdf_files`` is empty.
    """
    page_texts = []
    for source in pdf_files:
        pages = PyPDF2.PdfReader(source).pages
        page_texts.extend(page.extract_text() + "\n" for page in pages)
    return "".join(page_texts)

def get_text_from_txt(txt_files):
    """Decode each binary text file and concatenate the results.

    Args:
        txt_files: Iterable of binary file-like objects; ``read()`` must
            return ``bytes``.

    Returns:
        One string with each file's decoded content followed by a newline,
        in input order. Empty when ``txt_files`` is empty.
    """
    # "utf-8-sig" drops a leading byte-order mark if one is present, and
    # errors="replace" keeps one badly encoded upload from raising
    # UnicodeDecodeError and aborting the whole request.
    return "".join(
        txt.read().decode("utf-8-sig", errors="replace") + "\n"
        for txt in txt_files
    )

def get_text_from_csv(csv_files):
    """Render every CSV as a plain-text table and concatenate the tables.

    Args:
        csv_files: Iterable of sources that ``pandas.read_csv`` can read.

    Returns:
        One string holding ``DataFrame.to_string()`` of each file followed by
        a newline, in input order. Empty when ``csv_files`` is empty.
    """
    rendered_tables = [
        pd.read_csv(source).to_string() + "\n" for source in csv_files
    ]
    return "".join(rendered_tables)

def create_vector_database(text):
    """Split ``text`` into overlapping chunks and index them in a FAISS store.

    The text is cut into chunks of at most 1000 characters with a
    100-character overlap, embedded with ``OpenAIEmbeddings`` and loaded into
    a store via ``FAISS.from_texts``.

    Args:
        text: The full document text to index.

    Returns:
        The vector store returned by ``FAISS.from_texts``.

    Raises:
        ImportError: If no LangChain text-splitter module can be imported.
    """
    # The module never imports RecursiveCharacterTextSplitter, so the bare
    # name raised NameError on every call. Import it here: current LangChain
    # releases ship it in ``langchain_text_splitters``, older ones expose it
    # as ``langchain.text_splitter``.
    try:
        from langchain_text_splitters import RecursiveCharacterTextSplitter
    except ImportError:
        from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(text)
    embeddings = OpenAIEmbeddings()
    return FAISS.from_texts(chunks, embeddings)

def get_answer(question, vector_db):
    """Answer ``question`` from the documents indexed in ``vector_db``.

    The store's retriever is asked for documents relevant to the question.
    When it returns none, a fixed fallback message is returned. Otherwise the
    retrieved page contents are joined into a context, the question's
    language is detected with ``detect_language``, and ``gpt-3.5-turbo`` is
    asked to answer in that language using the context.

    Args:
        question: The user's question.
        vector_db: An object exposing ``as_retriever()``.

    Returns:
        The content of the model's first choice, or the fallback message
        when nothing relevant was retrieved.
    """
    matches = vector_db.as_retriever().get_relevant_documents(question)
    if not matches:
        return "I could not find the answer in the documents. Do you want me to search external sources?"

    context = "\n".join(match.page_content for match in matches)
    language = detect_language(question)

    system_prompt = f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions."
    user_prompt = question + "\n\nBased on the following context:\n" + context
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

def chatbot_interface(zip_file, question):
    """Gradio callback: answer ``question`` from the files in ``zip_file``.

    Text is gathered from the archive's PDF, TXT and CSV members (in that
    order), indexed with ``create_vector_database`` and passed to
    ``get_answer``. The index is rebuilt on every call.

    Args:
        zip_file: The uploaded archive, or a falsy value when nothing was
            uploaded.
        question: The user's question.

    Returns:
        The answer text, or a prompt to upload a ZIP file when no archive
        was given or no text could be extracted from it.
    """
    upload_prompt = "Please upload a ZIP file containing PDFs, TXTs, or CSVs before asking questions."
    if not zip_file:
        return upload_prompt

    archive = extract_files_from_zip(zip_file)
    corpus = "".join((
        get_text_from_pdf(archive["pdf"]),
        get_text_from_txt(archive["txt"]),
        get_text_from_csv(archive["csv"]),
    ))
    if not corpus:
        return upload_prompt

    index = create_vector_database(corpus)
    return get_answer(question, index)

# Gradio interface: a ZIP upload and a question box feed chatbot_interface,
# whose return value is shown in a text box.
_zip_upload = gr.File(file_types=[".zip"])
_question_box = gr.Textbox(placeholder="Type your question here...")
_answer_box = gr.Textbox()

demo = gr.Interface(
    fn=chatbot_interface,
    inputs=[_zip_upload, _question_box],
    outputs=_answer_box,
)

# Start the web app as soon as this module is executed.
demo.launch()