Spaces:

Rafa1986
/

Data-Analytics-Class

Sleeping

App Files Files Community

Data-Analytics-Class / app.py

Rafa1986

Update app.py

a52be2f verified 4 months ago

raw

history blame

5.59 kB

	import gradio as gr
	import os
	import PyPDF2
	import pandas as pd
	import openai
	import docx
	from docx import Document
	from langchain_community.embeddings import OpenAIEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain_community.llms import OpenAI
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	def detect_language(text):
	    """Ask the OpenAI chat model which language ``text`` is written in.

	    Args:
	        text: The text whose language should be identified.

	    Returns:
	        The model's reply with surrounding whitespace stripped. The reply is
	        returned as-is; it is not mapped to a language code.
	    """
	    # NOTE(review): `openai.ChatCompletion` exists only in openai<1.0 --
	    # confirm the version pinned for this app.
	    conversation = [
	        {"role": "system", "content": "Detect the language of this text."},
	        {"role": "user", "content": text},
	    ]
	    completion = openai.ChatCompletion.create(
	        model="gpt-3.5-turbo",
	        messages=conversation,
	    )
	    reply = completion["choices"][0]["message"]["content"]
	    return reply.strip()

	# Set up the OpenAI API key. Prefer the OPENAI_API_KEY environment variable so
	# the secret does not have to be committed to the source file; fall back to
	# the original placeholder so behavior is unchanged when the variable is unset.
	openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

	def extract_files_from_folder(folder_path):
	    """Scan a folder and its subfolders for PDF, TXT, CSV, and DOCX files.

	    Extensions are matched case-insensitively, so ``REPORT.PDF`` is collected
	    just like ``report.pdf``.

	    Args:
	        folder_path: Root directory to walk recursively.

	    Returns:
	        A dict with the keys ``"pdf"``, ``"txt"``, ``"csv"`` and ``"docx"``,
	        each mapping to a list of file paths (directory joined with the file
	        name) that end with that extension. Files with any other extension
	        are ignored.
	    """
	    extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": []}

	    print(f"Scanning folder: {folder_path}")
	    for root, _subdirs, files in os.walk(folder_path):
	        print(f"Checking folder: {root}")  # Debugging log for subfolders
	        for file_name in files:
	            # Compare against a lowercased copy so upper/mixed-case
	            # extensions are not silently skipped; keep the real name
	            # in the stored path.
	            lowered = file_name.lower()
	            for kind, paths in extracted_files.items():
	                if lowered.endswith("." + kind):
	                    paths.append(os.path.join(root, file_name))
	                    break

	    print("Files found:", extracted_files)  # Debugging log
	    return extracted_files

	def read_text_from_files(file_paths):
	    """Concatenate the contents of the given text files.

	    Each file is opened as UTF-8 with undecodable bytes ignored, and a
	    newline is appended after every file's content.

	    Args:
	        file_paths: Iterable of paths to text files.

	    Returns:
	        The combined text, or an empty string when no paths are given.
	    """
	    chunks = []
	    for path in file_paths:
	        print(f"Reading text file: {path}")  # Debugging log
	        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
	            chunks.append(handle.read() + "\n")
	    return "".join(chunks)

	def get_text_from_pdf(pdf_files):
	    """Extract the text of every page from the given PDF files.

	    Each page contributes its extracted text followed by a newline. A page
	    for which ``extract_text()`` returns nothing contributes a fixed
	    placeholder line instead.

	    Args:
	        pdf_files: Iterable of paths to PDF files.

	    Returns:
	        The combined text, or an empty string when no paths are given.
	    """
	    pieces = []
	    for path in pdf_files:
	        print(f"Reading PDF file: {path}")  # Debugging log
	        with open(path, "rb") as stream:
	            for page in PyPDF2.PdfReader(stream).pages:
	                extracted = page.extract_text()
	                if not extracted:
	                    pieces.append("[Could not extract text from this page]\n")
	                    continue
	                pieces.append(extracted + "\n")
	    return "".join(pieces)

	def get_text_from_csv(csv_files):
	    """Render the given CSV files as plain text.

	    Each file is loaded with ``pd.read_csv`` and rendered with
	    ``DataFrame.to_string()``, followed by a newline.

	    Args:
	        csv_files: Iterable of paths to CSV files.

	    Returns:
	        The combined rendered tables, or an empty string when no paths are
	        given.
	    """
	    rendered = []
	    for path in csv_files:
	        print(f"Reading CSV file: {path}")  # Debugging log
	        frame = pd.read_csv(path)
	        rendered.append(frame.to_string() + "\n")
	    return "".join(rendered)

	def get_text_from_docx(docx_files):
	    """Collect the paragraph text of the given DOCX files.

	    Every paragraph in ``Document(path).paragraphs`` contributes its text
	    followed by a newline.

	    Args:
	        docx_files: Iterable of paths to DOCX files.

	    Returns:
	        The combined paragraph text, or an empty string when no paths are
	        given.
	    """
	    pieces = []
	    for path in docx_files:
	        print(f"Reading DOCX file: {path}")  # Debugging log
	        document = Document(path)
	        pieces.extend(paragraph.text + "\n" for paragraph in document.paragraphs)
	    return "".join(pieces)

	def create_vector_database(text):
	    """Split ``text`` into overlapping chunks and index them in a FAISS store.

	    The text is split with a ``RecursiveCharacterTextSplitter`` configured
	    with ``chunk_size=1000`` and ``chunk_overlap=100``; the chunks are then
	    embedded with ``OpenAIEmbeddings`` and passed to ``FAISS.from_texts``.

	    Args:
	        text: The full document text to index.

	    Returns:
	        The vector store returned by ``FAISS.from_texts``.
	    """
	    chunks = RecursiveCharacterTextSplitter(
	        chunk_size=1000,
	        chunk_overlap=100,
	    ).split_text(text)
	    return FAISS.from_texts(chunks, OpenAIEmbeddings())

	def correct_exercises(text):
	    """Ask the OpenAI chat model to complete or correct exercises in ``text``.

	    Args:
	        text: Document text that may contain incomplete exercises. It is
	            sent to the model unmodified as the user message.

	    Returns:
	        The model's reply with surrounding whitespace stripped.
	    """
	    # NOTE(review): `openai.ChatCompletion` exists only in openai<1.0 --
	    # confirm the version pinned for this app.
	    instruction = "Analyze the text and complete or correct any incomplete exercises."
	    completion = openai.ChatCompletion.create(
	        model="gpt-3.5-turbo",
	        messages=[
	            {"role": "system", "content": instruction},
	            {"role": "user", "content": text},
	        ],
	    )
	    first_choice = completion["choices"][0]
	    return first_choice["message"]["content"].strip()

	def get_answer(question, vector_db, corrected_exercises):
	    """Answer ``question`` from the indexed documents via the OpenAI chat model.

	    Relevant documents are fetched through ``vector_db.as_retriever()``. Their
	    contents, together with ``corrected_exercises``, are appended to the
	    question, and the model is told to answer in the language reported by
	    ``detect_language(question)``.

	    Args:
	        question: The user's question.
	        vector_db: Object exposing ``as_retriever()``; the retriever must
	            provide ``get_relevant_documents(question)``.
	        corrected_exercises: Text appended to the prompt under
	            "Corrected Exercises".

	    Returns:
	        The model's reply, or a fixed fallback message when the retriever
	        returns no documents.
	    """
	    matches = vector_db.as_retriever().get_relevant_documents(question)
	    if not matches:
	        return "I could not find the answer in the documents. Do you want me to search external sources?"

	    context = "\n".join(match.page_content for match in matches)
	    language = detect_language(question)

	    system_prompt = f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."
	    user_prompt = (
	        question
	        + "\n\nBased on the following document content:\n"
	        + context
	        + "\n\nCorrected Exercises:\n"
	        + corrected_exercises
	    )
	    completion = openai.ChatCompletion.create(
	        model="gpt-3.5-turbo",
	        messages=[
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_prompt},
	        ],
	    )
	    return completion["choices"][0]["message"]["content"]

	def chatbot_interface(question):
	    """Answer ``question`` using the documents under the fixed data folder.

	    On every call the folder is rescanned, all supported files are read,
	    the exercises are corrected, a vector database is built from the text,
	    and the question is answered against it.

	    Args:
	        question: The question typed by the user.

	    Returns:
	        The answer produced by ``get_answer``, or a fixed message when the
	        folder yields no usable text.
	    """
	    folder_path = "/mnt/data/Data Analitics/"
	    extracted_files = extract_files_from_folder(folder_path)

	    text = "".join(
	        [
	            get_text_from_pdf(extracted_files["pdf"]),
	            read_text_from_files(extracted_files["txt"]),
	            get_text_from_csv(extracted_files["csv"]),
	            get_text_from_docx(extracted_files["docx"]),
	        ]
	    )

	    # The readers append "\n" per file/paragraph even when the content is
	    # empty, so a plain truthiness check would let whitespace-only text
	    # through to the model and the vector index. Treat it as "no content".
	    if not text.strip():
	        return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."

	    corrected_exercises = correct_exercises(text)
	    vector_db = create_vector_database(text)
	    return get_answer(question, vector_db, corrected_exercises)

	# Gradio interface: one question textbox wired to chatbot_interface, with the
	# returned answer shown in an output textbox.
	_question_box = gr.Textbox(
	    label="Ask a question",
	    placeholder="Type your question here...",
	)
	_answer_box = gr.Textbox(label="Answer")
	demo = gr.Interface(
	    fn=chatbot_interface,
	    inputs=_question_box,
	    outputs=_answer_box,
	)

	demo.launch()