File size: 5,574 Bytes
91c1e78
65881ce
 
 
 
94f3898
0530099
 
 
5baddf7
 
65881ce
 
 
 
 
 
 
 
 
 
91c1e78
65881ce
 
91c1e78
7e7a2a5
a64a105
 
5baddf7
8b5a642
 
 
a008248
 
 
 
 
 
 
 
a64a105
 
8b5a642
 
fbe2154
 
a008248
 
65881ce
a008248
8b5a642
a008248
 
65881ce
91c1e78
a008248
65881ce
a008248
8b5a642
a008248
 
 
94f3898
 
 
 
 
65881ce
91c1e78
65881ce
 
a008248
8b5a642
a008248
65881ce
 
91c1e78
94f3898
 
 
 
 
 
 
 
 
65881ce
 
 
 
 
 
91c1e78
a008248
 
 
 
 
 
 
 
 
 
 
 
65881ce
 
 
 
 
 
 
 
 
 
 
a008248
 
65881ce
 
 
91c1e78
be26077
 
a008248
 
94f3898
65881ce
 
a64a105
65881ce
a008248
65881ce
a008248
91c1e78
65881ce
 
 
a64a105
5baddf7
91c1e78
 
94f3898
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
import docx
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

def detect_language(text):
    """Ask the OpenAI chat API which language *text* is written in.

    Returns the model's answer verbatim (whitespace-stripped); the exact
    wording of the reply is whatever the model produces.
    """
    messages = [
        {"role": "system", "content": "Detect the language of this text."},
        {"role": "user", "content": text},
    ]
    # Network call to OpenAI; requires openai.api_key to be configured.
    result = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    return result["choices"][0]["message"]["content"].strip()

# Set up OpenAI API key (replace with your key)
# NOTE(review): hard-coded secret in source — prefer reading it from the
# environment, e.g. os.environ["OPENAI_API_KEY"], and never commit real keys.
openai.api_key = "YOUR_OPENAI_API_KEY"

def extract_files_from_folder(folder_path):
    """Recursively scan *folder_path* for PDF, TXT, CSV, and DOCX files.

    Args:
        folder_path: Root directory; all subfolders are walked via os.walk.

    Returns:
        Dict with keys "pdf", "txt", "csv", "docx", each mapping to a list
        of full paths of the matching files found.
    """
    extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": []}

    print(f"Scanning folder: {folder_path}")
    for root, _subdirs, files in os.walk(folder_path):
        print(f"Checking folder: {root}")  # Debugging log for subfolders
        for file_name in files:
            # Compare extensions case-insensitively so files like
            # "Report.PDF" or "notes.TXT" are not silently skipped.
            ext = os.path.splitext(file_name)[1].lower().lstrip(".")
            if ext in extracted_files:
                extracted_files[ext].append(os.path.join(root, file_name))

    print("Files found:", extracted_files)  # Debugging log
    return extracted_files

def read_text_from_files(file_paths):
    """Concatenate the contents of the given text files.

    Each file's content is followed by a newline; decoding errors are
    ignored so a single bad byte does not abort the whole run.
    """
    chunks = []
    for path in file_paths:
        print(f"Reading text file: {path}")  # Debugging log
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            chunks.append(handle.read() + "\n")
    return "".join(chunks)

def get_text_from_pdf(pdf_files):
    """Extract text from every page of each PDF in *pdf_files*.

    Pages from which PyPDF2 cannot extract text contribute a visible
    placeholder line instead of being dropped silently.
    """
    chunks = []
    for pdf_path in pdf_files:
        print(f"Reading PDF file: {pdf_path}")  # Debugging log
        with open(pdf_path, "rb") as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                extracted = page.extract_text()
                if extracted:
                    chunks.append(extracted + "\n")
                else:
                    chunks.append("[Could not extract text from this page]\n")
    return "".join(chunks)

def get_text_from_csv(csv_files):
    """Render each CSV file as a plain-text table and concatenate the results.

    Uses pandas' default parsing and DataFrame.to_string formatting; each
    table is followed by a newline.
    """
    parts = []
    for csv_path in csv_files:
        print(f"Reading CSV file: {csv_path}")  # Debugging log
        parts.append(pd.read_csv(csv_path).to_string() + "\n")
    return "".join(parts)

def get_text_from_docx(docx_files):
    """Collect the paragraph text of each DOCX file, one paragraph per line."""
    lines = []
    for docx_path in docx_files:
        print(f"Reading DOCX file: {docx_path}")  # Debugging log
        document = docx.Document(docx_path)
        lines.extend(paragraph.text + "\n" for paragraph in document.paragraphs)
    return "".join(lines)

def create_vector_database(text):
    """Split *text* into overlapping chunks and index them in a FAISS store.

    Chunking: 1000 characters per chunk with 100 characters of overlap.
    Embedding each chunk calls the OpenAI embeddings API (network call).
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100
    ).split_text(text)
    return FAISS.from_texts(chunks, OpenAIEmbeddings())

def correct_exercises(text):
    """Ask OpenAI to complete/correct any incomplete exercises found in *text*.

    Returns the model's reply with surrounding whitespace stripped.
    """
    messages = [
        {"role": "system", "content": "Analyze the text and complete or correct any incomplete exercises."},
        {"role": "user", "content": text},
    ]
    # Network call to OpenAI; requires openai.api_key to be configured.
    result = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    return result["choices"][0]["message"]["content"].strip()

def get_answer(question, vector_db, corrected_exercises):
    """Answer *question* using retrieved document context plus corrected exercises.

    Retrieves relevant chunks from *vector_db*; if none are found, returns a
    fixed fallback message instead of calling the model. The reply is asked
    for in the question's own language (detected via detect_language).
    """
    docs = vector_db.as_retriever().get_relevant_documents(question)

    if not docs:
        return "I could not find the answer in the documents. Do you want me to search external sources?"

    context = "\n".join(doc.page_content for doc in docs)
    language = detect_language(question)
    prompt = (
        question
        + "\n\nBased on the following document content:\n" + context
        + "\n\nCorrected Exercises:\n" + corrected_exercises
    )
    result = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."},
            {"role": "user", "content": prompt},
        ],
    )
    return result["choices"][0]["message"]["content"]

def chatbot_interface(question):
    """End-to-end pipeline behind the Gradio UI.

    Scans a fixed folder, extracts text from all supported files, corrects
    any exercises found, builds a vector index, and answers *question*.
    NOTE: the whole folder is re-read and re-indexed on every call.
    """
    folder_path = "/mnt/data/Data Analitics/"
    files = extract_files_from_folder(folder_path)

    combined = "".join([
        get_text_from_pdf(files["pdf"]),
        read_text_from_files(files["txt"]),
        get_text_from_csv(files["csv"]),
        get_text_from_docx(files["docx"]),
    ])

    if not combined:
        return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."

    corrected = correct_exercises(combined)
    vector_db = create_vector_database(combined)
    return get_answer(question, vector_db, corrected)

# Gradio interface
# Single-textbox UI: each submitted question runs chatbot_interface, which
# re-scans and re-indexes the whole folder per call (expensive per question).
demo = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(label="Ask a question", placeholder="Type your question here..."),
    outputs=gr.Textbox(label="Answer")
)

# Starts the local Gradio web server (blocking call).
demo.launch()