File size: 4,777 Bytes
91c1e78 65881ce 0530099 5baddf7 65881ce 91c1e78 65881ce 91c1e78 7e7a2a5 a008248 fbe2154 5baddf7 a008248 fbe2154 a008248 65881ce a008248 65881ce 91c1e78 a008248 65881ce a008248 65881ce 91c1e78 65881ce a008248 65881ce 91c1e78 65881ce 91c1e78 a008248 65881ce a008248 65881ce 91c1e78 be26077 a008248 5baddf7 a008248 65881ce a008248 65881ce a008248 65881ce a008248 91c1e78 65881ce a008248 5baddf7 91c1e78 65881ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
def detect_language(text):
    """Detect the language of ``text`` by asking an OpenAI chat model.

    Args:
        text: The text whose language should be identified.

    Returns:
        The model's reply with surrounding whitespace stripped. The system
        prompt does not constrain the reply format, so treat it as free-form
        text rather than a fixed language code.
    """
    # NOTE(review): the model name is not present in the original source;
    # "gpt-3.5-turbo" is an assumption -- confirm before deploying.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Detect the language of this text."},
            {"role": "user", "content": text},
        ],
    )
    return response["choices"][0]["message"]["content"].strip()
# Set up the OpenAI API key. Prefer the OPENAI_API_KEY environment variable so
# a real secret never has to be written into this file; the original
# placeholder is kept as the fallback when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
def extract_files_from_folder(folder_path):
    """Scan a folder and its subfolders for PDF, TXT, and CSV files.

    Args:
        folder_path: Root directory to walk recursively.

    Returns:
        A dict with the keys "pdf", "txt" and "csv". Each value is a list of
        paths (root joined with file name) whose name ends with that
        extension, compared case-insensitively. A folder that does not exist
        or holds no matching files yields three empty lists.
    """
    extracted_files = {"pdf": [], "txt": [], "csv": []}
    for root, _, files in os.walk(folder_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            # Lower-case once so "REPORT.PDF" is bucketed like "report.pdf".
            lowered = file_name.lower()
            for kind, bucket in extracted_files.items():
                if lowered.endswith("." + kind):
                    bucket.append(file_path)
                    break
    return extracted_files
def read_text_from_files(file_paths):
    """Read and concatenate the text content of a list of files.

    Args:
        file_paths: Iterable of paths to text files.

    Returns:
        The contents of every file, in the given order, each followed by a
        newline. An empty iterable yields "".

    Files are decoded as UTF-8; undecodable bytes are dropped
    (``errors="ignore"``) rather than raising.
    """
    contents = []
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            contents.append(file.read() + "\n")
    return "".join(contents)
def get_text_from_pdf(pdf_files):
    """Extract the text of every page of the given PDF files.

    Args:
        pdf_files: Iterable of paths to PDF files.

    Returns:
        The extracted text of all pages, in file and page order, each page
        followed by a newline. An empty iterable yields "".
    """
    pages = []
    for pdf_path in pdf_files:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                # Guard against a falsy result (e.g. None for a page with no
                # extractable text) so the concatenation cannot raise.
                pages.append((page.extract_text() or "") + "\n")
    return "".join(pages)
def get_text_from_csv(csv_files):
    """Render the given CSV files as plain text.

    Args:
        csv_files: Iterable of paths to CSV files.

    Returns:
        For each file, the ``DataFrame.to_string()`` rendering of its
        contents followed by a newline, concatenated in the given order.
        An empty iterable yields "".
    """
    # Collect the pieces and join once instead of growing a string in a loop.
    return "".join(
        pd.read_csv(csv_path).to_string() + "\n" for csv_path in csv_files
    )
def create_vector_database(text):
    """Build a FAISS vector store from ``text``.

    The text is split with a RecursiveCharacterTextSplitter configured with
    chunk_size=1000 and chunk_overlap=100; the resulting chunks are embedded
    with OpenAIEmbeddings and indexed via ``FAISS.from_texts``.

    Args:
        text: The full document text to index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return FAISS.from_texts(chunker.split_text(text), OpenAIEmbeddings())
def correct_exercises(text):
    """Use OpenAI to correct and complete exercises found in the documents.

    Args:
        text: Document text that may contain incomplete exercises. It is sent
            to the model in a single user message.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # NOTE(review): the model name is not present in the original source;
    # "gpt-3.5-turbo" is an assumption -- confirm before deploying.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Analyze the text and complete or correct any incomplete exercises."},
            {"role": "user", "content": text},
        ],
    )
    return response["choices"][0]["message"]["content"].strip()
def get_answer(question, vector_db, corrected_exercises):
    """Answer ``question`` from the indexed documents via an OpenAI chat model.

    Args:
        question: The user's question.
        vector_db: Vector store exposing ``as_retriever()``; the retriever's
            ``get_relevant_documents`` supplies the context passages.
        corrected_exercises: Text of the corrected exercises, appended to the
            prompt as additional material.

    Returns:
        The model's answer, or a fixed fallback message when the retriever
        returns no documents (in which case no OpenAI call is made).
    """
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(question)
    if not docs:
        return "I could not find the answer in the documents. Do you want me to search external sources?"

    context = "\n".join(doc.page_content for doc in docs)
    # The detected language is injected into the system prompt so the reply
    # is written in the language the question was asked in.
    language = detect_language(question)
    # NOTE(review): the model name is not present in the original source;
    # "gpt-3.5-turbo" is an assumption -- confirm before deploying.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."},
            {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nCorrected Exercises:\n" + corrected_exercises},
        ],
    )
    return response["choices"][0]["message"]["content"]
def chatbot_interface(question, *, folder_path="/mnt/data/Data Analitics/"):
    """Answer ``question`` using the documents found under ``folder_path``.

    Every call scans the folder, extracts text from its PDF, TXT and CSV
    files, asks the model to correct the exercises in that text, builds a
    vector store, and then answers the question from it.

    Args:
        question: The user's question.
        folder_path: Keyword-only. Folder to scan for documents; defaults to
            the path the original implementation hard-coded.

    Returns:
        The answer text, or an explanatory message when no folder path is
        given or the folder yields no usable text.
    """
    # NOTE(review): the Gradio interface in this file also declares a
    # "Folder Path" textbox; how it is wired to this function is not visible
    # here -- confirm the inputs match this signature.
    if not folder_path:
        return "Please provide a folder path before asking a question."

    extracted_files = extract_files_from_folder(folder_path)
    text = (
        get_text_from_pdf(extracted_files["pdf"])
        + read_text_from_files(extracted_files["txt"])
        + get_text_from_csv(extracted_files["csv"])
    )
    # The extractors append "\n" per page/file, so files with no real content
    # still produce a non-empty, whitespace-only string; treat that as empty.
    if not text.strip():
        return "The folder does not contain valid PDF, TXT, or CSV files. Please upload supported file types."

    corrected_exercises = correct_exercises(text)
    vector_db = create_vector_database(text)
    return get_answer(question, vector_db, corrected_exercises)
# Gradio interface
demo = gr.Interface(
inputs=[gr.Textbox(label="Folder Path", placeholder="Enter the path to the folder containing the documents"),
gr.Textbox(label="Ask a question", placeholder="Type your question here...")],