File size: 5,223 Bytes
91c1e78
65881ce
 
 
 
94f3898
51ac55a
0302345
a52be2f
0530099
 
 
5baddf7
 
65881ce
 
 
 
 
 
 
 
 
 
91c1e78
65881ce
 
91c1e78
7e7a2a5
0302345
 
5baddf7
8b5a642
 
 
a008248
 
0302345
a008248
 
 
 
 
 
a64a105
 
0302345
 
8b5a642
 
fbe2154
 
a6d5350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c80efa
 
0302345
 
 
 
 
 
 
0c80efa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0302345
 
65881ce
0302345
 
65881ce
0c80efa
91c1e78
65881ce
 
 
a64a105
5baddf7
91c1e78
 
0c80efa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
import docx
import requests
import json
from docx import Document
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

def detect_language(text):
    """Ask the OpenAI chat model which language *text* is written in.

    Sends *text* as the user message to ``gpt-3.5-turbo`` together with a
    system prompt requesting language detection, and returns the content of
    the first choice in the reply with surrounding whitespace stripped.
    """
    prompt_messages = [
        {"role": "system", "content": "Detect the language of this text."},
        {"role": "user", "content": text},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=prompt_messages,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()

# Set up the OpenAI API key. The OPENAI_API_KEY environment variable is used
# when it is set, so the real key does not have to be written into this file.
# The original placeholder is kept as the fallback, so behavior is unchanged
# when the variable is absent (replace it with your key in that case).
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

def extract_files_from_folder(folder_path):
    """Scan a folder tree and group supported files by type.

    Walks *folder_path* and all of its subfolders with ``os.walk`` and
    collects the paths of PDF, TXT, CSV, DOCX, and IPYNB files. Extensions
    are matched case-insensitively, so ``REPORT.PDF`` is picked up the same
    way as ``report.pdf``. Files with any other extension are ignored.
    Progress is printed to stdout as a debugging aid.

    Args:
        folder_path: Root directory to scan.

    Returns:
        A dict with the keys ``"pdf"``, ``"txt"``, ``"csv"``, ``"docx"`` and
        ``"ipynb"``, each mapping to a list of matching file paths (an empty
        list when no file of that type was found).
    """
    extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}

    print(f"Scanning folder: {folder_path}")
    for root, _subdirs, files in os.walk(folder_path):
        print(f"Checking folder: {root}")  # Debugging log for subfolders
        for file_name in files:
            file_path = os.path.join(root, file_name)
            print(f"Found file: {file_path}")
            # Compare against the lowercased name so upper- or mixed-case
            # extensions are not silently skipped.
            lowered_name = file_name.lower()
            for extension, matches in extracted_files.items():
                if lowered_name.endswith(f".{extension}"):
                    matches.append(file_path)
                    break

    print("Files found:", extracted_files)  # Debugging log
    return extracted_files

def get_text_from_pdf(pdf_files):
    """Return the concatenated page text of the given PDF files.

    Each path in *pdf_files* is opened in binary mode and read with
    ``PyPDF2.PdfReader``; the extracted text of every page is followed by a
    newline. An empty input yields an empty string.
    """
    page_texts = []
    for pdf_path in pdf_files:
        # The file must stay open while the reader's pages are consumed.
        with open(pdf_path, "rb") as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                page_texts.append(page.extract_text() + "\n")
    return "".join(page_texts)

def read_text_from_files(file_paths):
    """Return the contents of the given text files, each followed by a newline.

    Files are decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors="ignore"``) rather than raising.
    """
    contents = []
    for path in file_paths:
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            contents.append(handle.read() + "\n")
    return "".join(contents)

def get_text_from_csv(csv_files):
    """Return the given CSV files rendered as text.

    Each path is loaded with ``pandas.read_csv`` and rendered with
    ``DataFrame.to_string()``; every rendered table is followed by a newline.
    """
    rendered_tables = [pd.read_csv(path).to_string() + "\n" for path in csv_files]
    return "".join(rendered_tables)

def get_text_from_docx(docx_files):
    """Return the paragraph text of the given DOCX files.

    Each path is opened with ``docx.Document``; the text of every entry in
    its ``paragraphs`` is followed by a newline. An empty input yields an
    empty string.
    """
    paragraph_lines = []
    for path in docx_files:
        for paragraph in Document(path).paragraphs:
            paragraph_lines.append(paragraph.text + "\n")
    return "".join(paragraph_lines)

def get_text_from_ipynb(ipynb_files):
    """Extract the source of markdown and code cells from notebook files.

    Each path in *ipynb_files* is parsed as JSON. For every cell whose
    ``cell_type`` is ``"markdown"`` or ``"code"``, the cell's ``source`` is
    appended to the result followed by a newline; other cell types are
    skipped. A ``source`` given as a list of strings is joined with
    newlines; a ``source`` given as a single string is used as-is.

    Args:
        ipynb_files: Iterable of paths to ``.ipynb`` files.

    Returns:
        The collected cell text as one string (empty if there are no files
        or no matching cells).

    Raises:
        json.JSONDecodeError: If a file is not valid JSON.
    """
    parts = []
    for ipynb_path in ipynb_files:
        with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
            content = json.load(file)
        # The notebook is fully parsed; the file no longer needs to be open.
        for cell in content.get("cells", []):
            if cell.get("cell_type") not in ("markdown", "code"):
                continue
            source = cell.get("source", [])
            if isinstance(source, str):
                # Joining a plain string would insert a newline between
                # every character, so keep it as one block of text.
                parts.append(source + "\n")
            else:
                parts.append("\n".join(source) + "\n")
    return "".join(parts)

def combine_text_from_files(extracted_files):
    """Concatenate the text extracted from every supported file type.

    *extracted_files* must provide the keys ``"pdf"``, ``"txt"``, ``"csv"``,
    ``"docx"`` and ``"ipynb"``; the text for each is produced by the matching
    extractor and the pieces are joined in that order.
    """
    pdf_text = get_text_from_pdf(extracted_files["pdf"])
    txt_text = read_text_from_files(extracted_files["txt"])
    csv_text = get_text_from_csv(extracted_files["csv"])
    docx_text = get_text_from_docx(extracted_files["docx"])
    notebook_text = get_text_from_ipynb(extracted_files["ipynb"])
    return "".join([pdf_text, txt_text, csv_text, docx_text, notebook_text])

def generate_response(question, text, *, max_chars=3000):
    """Use OpenAI to answer a question based on extracted document text.

    Sends *question* plus a leading slice of *text* to ``gpt-3.5-turbo`` and
    returns the content of the first choice in the reply, stripped of
    surrounding whitespace.

    Args:
        question: The user's question.
        text: Document content to ground the answer in.
        max_chars: Keyword-only. Number of leading characters of *text*
            included in the prompt, to avoid excessive token usage. Defaults
            to 3000, the limit this function previously hard-coded.

    Returns:
        The model's answer as a string.
    """
    document_excerpt = text[:max_chars]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a data analytics assistant. Answer the question based on the provided document content."},
            {"role": "user", "content": f"{question}\n\nBased on the following document content:\n{document_excerpt}"},
        ]
    )
    return response["choices"][0]["message"]["content"].strip()

def chatbot_interface(question):
    """Answer *question* from the documents under ``New_Data_Analytics/``.

    On every call the folder is rescanned, the text of all supported files
    is combined, and the question is passed to ``generate_response`` with
    that text. If the combined text is empty or whitespace-only, a message
    explaining that no supported files were found is returned instead.
    """
    files_by_type = extract_files_from_folder("New_Data_Analytics/")
    combined_text = combine_text_from_files(files_by_type)

    # Debugging log: show only the first 500 characters of the text.
    print("Final extracted text for chatbot processing:", combined_text[:500])

    if combined_text.strip():
        return generate_response(question, combined_text)

    return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."

# Gradio interface: one textbox for the question, one for the answer, wired
# to chatbot_interface. The app is launched as soon as this module runs.
_question_box = gr.Textbox(label="Ask a question", placeholder="Type your question here...")
_answer_box = gr.Textbox(label="Answer")

demo = gr.Interface(fn=chatbot_interface, inputs=_question_box, outputs=_answer_box)

demo.launch()