import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
import zipfile
from io import BytesIO
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
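# Assumed dependencies (versions are not pinned in the original): gradio, PyPDF2,
# pandas, langchain, langchain-community, faiss-cpu, and an openai package older
# than 1.0, since the legacy openai.ChatCompletion interface is used below.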


def detect_language(text):
    """Detects the language of the input text using OpenAI."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Detect the language of this text."},
            {"role": "user", "content": text}
        ]
    )
    return response["choices"][0]["message"]["content"].strip()

# Configure the OpenAI API key: prefer the OPENAI_API_KEY environment variable
# over hardcoding a key in source (replace the fallback with your key if needed).
openai.api_key = os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

def extract_files_from_zip(zip_path):
    """Extracts PDF, TXT, and CSV files from a ZIP archive, including subfolders."""
    extracted_files = {"pdf": [], "txt": [], "csv": []}
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if file_name.endswith(('.pdf', '.txt', '.csv')):
                with zip_ref.open(file_name) as file:
                    content = file.read()
                    if file_name.endswith(".pdf"):
                        extracted_files["pdf"].append(BytesIO(content))
                    elif file_name.endswith(".txt"):
                        extracted_files["txt"].append(BytesIO(content))
                    elif file_name.endswith(".csv"):
                        extracted_files["csv"].append(BytesIO(content))
    return extracted_files

def analyze_text(text):
    """Uses OpenAI to analyze notes, links, and complementary information in the text."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Analyze this document and extract key points, links, and complementary information."},
            {"role": "user", "content": text}
        ]
    )
    return response["choices"][0]["message"]["content"].strip()

def get_text_from_pdf(pdf_files):
    """Concatenates the extractable text of every page of every PDF file."""
    text = ""
    for pdf in pdf_files:
        reader = PyPDF2.PdfReader(pdf)
        for page in reader.pages:
            # extract_text() may return None for image-only pages
            text += (page.extract_text() or "") + "\n"
    return text

def get_text_from_txt(txt_files):
    """Decodes and concatenates the contents of the plain-text files."""
    text = ""
    for txt in txt_files:
        text += txt.read().decode("utf-8", errors="replace") + "\n"
    return text

def get_text_from_csv(csv_files):
    """Renders each CSV file as a plain-text table and concatenates the results."""
    text = ""
    for csv in csv_files:
        df = pd.read_csv(csv)
        text += df.to_string() + "\n"
    return text

def create_vector_database(text):
    """Splits the text into overlapping chunks and indexes them in a FAISS vector store."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = splitter.split_text(text)
    embeddings = OpenAIEmbeddings()
    vector_db = FAISS.from_texts(texts, embeddings)
    return vector_db

def get_answer(question, vector_db, analysis):
    """Retrieves relevant chunks and answers the question in its detected language."""
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(question)
    
    if not docs:
        return "I could not find the answer in the documents. Do you want me to search external sources?"
    
    context = "\n".join([doc.page_content for doc in docs])
    language = detect_language(question)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents and their analyses to answer questions."},
            {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nAdditional insights:\n" + analysis}
        ]
    )
    return response["choices"][0]["message"]["content"]

def chatbot_interface(zip_file_path, question):
    """Gradio callback: extracts the uploaded ZIP, indexes its contents, and answers the question."""
    if not zip_file_path:
        return "Please upload a ZIP file before asking a question."
    
    extracted_files = extract_files_from_zip(zip_file_path)
    text = (
        get_text_from_pdf(extracted_files["pdf"])
        + get_text_from_txt(extracted_files["txt"])
        + get_text_from_csv(extracted_files["csv"])
    )
    
    if not text.strip():
        return "The ZIP file does not contain valid PDF, TXT, or CSV files. Please upload supported file types."
    
    analysis = analyze_text(text)
    vector_db = create_vector_database(text)
    return get_answer(question, vector_db, analysis)
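
# Example of calling the pipeline directly (sketch; "reports.zip" and the
# question are hypothetical placeholders, and a valid API key must be set):
# answer = chatbot_interface("reports.zip", "What are the key points in the notes?")
# print(answer)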

# Gradio interface
demo = gr.Interface(
    fn=chatbot_interface,
    inputs=[gr.File(label="Upload ZIP File"),
            gr.Textbox(label="Ask a question", placeholder="Type your question here...")],
    outputs=gr.Textbox(label="Answer")
)

demo.launch()