Rafa1986's picture
Update app.py
94f3898 verified
raw
history blame
5.57 kB
import gradio as gr
import os
import PyPDF2
import pandas as pd
import openai
import docx
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
def detect_language(text):
    """Ask the OpenAI chat model which language ``text`` is written in.

    ``text`` is sent as the user message to ``gpt-3.5-turbo`` together with
    a system prompt requesting language detection.  The content of the first
    returned choice is handed back with surrounding whitespace stripped; it
    is free-form model output, so its exact wording is not guaranteed.
    """
    system_prompt = {"role": "system", "content": "Detect the language of this text."}
    user_prompt = {"role": "user", "content": text}
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_prompt, user_prompt],
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()
# Set up the OpenAI API key.  Prefer the OPENAI_API_KEY environment variable
# so a real key never has to be committed to the source; when it is not set,
# fall back to the placeholder below (replace it with your key).
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
def extract_files_from_folder(folder_path):
    """Scan a folder and its subfolders for PDF, TXT, CSV, and DOCX files.

    Args:
        folder_path: Root directory handed to ``os.walk``.

    Returns:
        A dict with the keys ``"pdf"``, ``"txt"``, ``"csv"`` and ``"docx"``,
        each mapping to the list of matching file paths (the walked directory
        joined with the file name).  Files with any other extension are
        ignored; if nothing matches, all four lists are empty.
    """
    extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": []}
    print(f"Scanning folder: {folder_path}")
    for root, _subdirs, files in os.walk(folder_path):
        print(f"Checking folder: {root}")  # Debugging log for subfolders
        for file_name in files:
            # Match the extension case-insensitively so that names such as
            # "REPORT.PDF" or "Notes.Docx" are not silently skipped.
            lowered_name = file_name.lower()
            for kind, paths in extracted_files.items():
                if lowered_name.endswith("." + kind):
                    paths.append(os.path.join(root, file_name))
                    break
    print("Files found:", extracted_files)  # Debugging log
    return extracted_files
def read_text_from_files(file_paths):
    """Return the concatenated contents of the given text files.

    Every file is opened as UTF-8 with undecodable bytes ignored, and a
    newline is appended after each file's content.  An empty ``file_paths``
    produces an empty string.
    """
    pieces = []
    for path in file_paths:
        print(f"Reading text file: {path}")  # Debugging log
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            pieces.append(handle.read())
            pieces.append("\n")
    return "".join(pieces)
def get_text_from_pdf(pdf_files):
    """Return the text of every page of the given PDF files, in order.

    Each page contributes its extracted text followed by a newline.  When
    ``extract_text`` returns nothing for a page, a placeholder line is
    emitted instead so the gap stays visible in the output.  An empty
    ``pdf_files`` produces an empty string.
    """
    chunks = []
    for pdf_path in pdf_files:
        print(f"Reading PDF file: {pdf_path}")  # Debugging log
        with open(pdf_path, "rb") as stream:
            # Pages are read while the file is still open.
            for page in PyPDF2.PdfReader(stream).pages:
                extracted = page.extract_text()
                chunks.append(extracted or "[Could not extract text from this page]")
                chunks.append("\n")
    return "".join(chunks)
def get_text_from_csv(csv_files):
    """Return the given CSV files rendered as plain text.

    Each file is loaded with ``pandas.read_csv`` and rendered through
    ``DataFrame.to_string``, followed by a newline.  The renderings are
    concatenated in the order the paths were given; an empty ``csv_files``
    produces an empty string.
    """
    rendered = []
    for csv_path in csv_files:
        print(f"Reading CSV file: {csv_path}")  # Debugging log
        frame = pd.read_csv(csv_path)
        rendered.append(frame.to_string() + "\n")
    return "".join(rendered)
def get_text_from_docx(docx_files):
    """Return the paragraph text of the given DOCX files.

    Every paragraph of every document contributes its ``text`` followed by a
    newline, in document order.  Only ``Document.paragraphs`` is read.  An
    empty ``docx_files`` produces an empty string.
    """
    lines = []
    for docx_path in docx_files:
        print(f"Reading DOCX file: {docx_path}")  # Debugging log
        document = docx.Document(docx_path)
        lines.extend(paragraph.text + "\n" for paragraph in document.paragraphs)
    return "".join(lines)
def create_vector_database(text):
    """Build a FAISS vector store from ``text``.

    The text is split by a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=1000`` and ``chunk_overlap=100``; the resulting chunks are
    embedded with ``OpenAIEmbeddings`` and indexed via ``FAISS.from_texts``.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    ).split_text(text)
    return FAISS.from_texts(chunks, OpenAIEmbeddings())
def correct_exercises(text):
    """Have the OpenAI chat model complete or correct exercises in ``text``.

    The whole ``text`` is sent as a single user message to ``gpt-3.5-turbo``
    with a system prompt asking it to fix incomplete exercises.  The content
    of the first returned choice is handed back with surrounding whitespace
    stripped.
    """
    instructions = {
        "role": "system",
        "content": "Analyze the text and complete or correct any incomplete exercises.",
    }
    document = {"role": "user", "content": text}
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[instructions, document],
    )
    reply = completion["choices"][0]["message"]
    return reply["content"].strip()
def get_answer(question, vector_db, corrected_exercises):
    """Answer ``question`` from the indexed documents via the OpenAI chat model.

    Args:
        question: The user's question; also used as the retrieval query.
        vector_db: Object exposing ``as_retriever()``, whose retriever
            provides ``get_relevant_documents(question)``.
        corrected_exercises: Text appended to the prompt under a
            "Corrected Exercises" heading.

    Returns:
        The content of the model's first choice, or a fixed fallback message
        when the retriever returns no documents (in which case no model call
        is made).
    """
    matches = vector_db.as_retriever().get_relevant_documents(question)
    if not matches:
        return "I could not find the answer in the documents. Do you want me to search external sources?"

    context = "\n".join(match.page_content for match in matches)
    # The reply language is taken from the question itself.
    language = detect_language(question)

    system_message = f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."
    user_message = (
        question
        + "\n\nBased on the following document content:\n"
        + context
        + "\n\nCorrected Exercises:\n"
        + corrected_exercises
    )
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
    )
    return completion["choices"][0]["message"]["content"]
def chatbot_interface(question):
    """Gradio callback: answer ``question`` from the documents folder.

    On every call the fixed data folder is rescanned, all supported files
    are read into one text (PDF, then TXT, then CSV, then DOCX), the
    exercises are corrected, a vector database is built, and the question is
    answered against it.  When no text could be read, an explanatory message
    is returned instead.
    """
    folder_path = "/mnt/data/Data Analitics/"
    found = extract_files_from_folder(folder_path)

    # The readers run in this fixed order; the join keeps it.
    text = "".join(
        [
            get_text_from_pdf(found["pdf"]),
            read_text_from_files(found["txt"]),
            get_text_from_csv(found["csv"]),
            get_text_from_docx(found["docx"]),
        ]
    )
    if not text:
        return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."

    corrected_exercises = correct_exercises(text)
    vector_db = create_vector_database(text)
    return get_answer(question, vector_db, corrected_exercises)
# Gradio interface: one question textbox in, one answer textbox out, with
# chatbot_interface as the callback.
_question_box = gr.Textbox(label="Ask a question", placeholder="Type your question here...")
_answer_box = gr.Textbox(label="Answer")
demo = gr.Interface(fn=chatbot_interface, inputs=_question_box, outputs=_answer_box)

# Launch the interface as soon as the module is executed.
demo.launch()