Spaces:

Rafa1986
/

Data-Analytics-Class

Sleeping

App Files Files Community

Data-Analytics-Class / app.py

Rafa1986

Update app.py

0302345 verified 6 months ago

raw

history blame

5.14 kB

	import gradio as gr
	import os
	import PyPDF2
	import pandas as pd
	import openai
	import docx
	import requests
	import json
	from docx import Document
	from langchain_community.embeddings import OpenAIEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain_community.llms import OpenAI
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	def detect_language(text):
	    """Detect the language of ``text`` by asking an OpenAI chat model.

	    Sends ``text`` as the user message to ``gpt-3.5-turbo`` together with a
	    system prompt requesting language detection, and returns the model's
	    reply with surrounding whitespace stripped. The reply is free-form model
	    output, so its exact wording is not guaranteed.

	    Both generations of the ``openai`` package are supported: releases from
	    1.0 onward removed ``openai.ChatCompletion`` in favour of
	    ``openai.chat.completions`` and return objects rather than dicts.

	    Args:
	        text: The text whose language should be identified.

	    Returns:
	        The stripped content of the first completion choice, or an empty
	        string if the model returned no content.
	    """
	    model = "gpt-3.5-turbo"
	    messages = [
	        {"role": "system", "content": "Detect the language of this text."},
	        {"role": "user", "content": text},
	    ]

	    # openai>=1.0 exposes the ``OpenAI`` client class; on those versions the
	    # legacy ``openai.ChatCompletion`` entry point raises instead of calling
	    # the API, so pick the call style that matches the installed package.
	    if hasattr(openai, "OpenAI"):
	        response = openai.chat.completions.create(model=model, messages=messages)
	        content = response.choices[0].message.content
	    else:
	        response = openai.ChatCompletion.create(model=model, messages=messages)
	        content = response["choices"][0]["message"]["content"]

	    # The v1 API types ``content`` as optional, so guard before stripping.
	    return (content or "").strip()

	# Take the OpenAI API key from the OPENAI_API_KEY environment variable so the
	# secret does not have to be written into this file. The placeholder is kept
	# only as a fallback when the variable is unset (replace it with your key).
	openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

	def extract_files_from_folder(folder_path):
	    """Collect supported files under ``folder_path``, grouped by extension.

	    Walks the folder and all of its subfolders and records the path of every
	    file whose name ends in ``.pdf``, ``.txt``, ``.csv``, ``.docx`` or
	    ``.ipynb``. Extensions are matched case-insensitively, so ``REPORT.PDF``
	    is picked up as well. Subfolders and file names are visited in sorted
	    order so the result does not depend on the directory listing order.

	    Args:
	        folder_path: Root directory to scan. ``os.walk`` yields nothing for a
	            path it cannot list, in which case every list stays empty.

	    Returns:
	        A dict with the keys ``"pdf"``, ``"txt"``, ``"csv"``, ``"docx"`` and
	        ``"ipynb"``, each mapping to a list of matching file paths (built
	        with ``os.path.join`` from the walked directory and the file name).
	    """
	    extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}

	    print(f"Scanning folder: {folder_path}")
	    for root, subdirs, files in os.walk(folder_path):
	        # Sorting in place controls the order in which os.walk descends.
	        subdirs.sort()
	        print(f"Checking folder: {root}")  # Debugging log for subfolders
	        for file_name in sorted(files):
	            file_path = os.path.join(root, file_name)
	            print(f"Found file: {file_path}")
	            # Compare on the lower-cased name so upper/mixed-case extensions
	            # are not silently ignored.
	            lowered = file_name.lower()
	            for kind in extracted_files:
	                if lowered.endswith("." + kind):
	                    extracted_files[kind].append(file_path)
	                    break

	    print("Files found:", extracted_files)  # Debugging log
	    return extracted_files

	def read_text_from_files(file_paths):
	    """Concatenate the contents of the given text files.

	    Each file is decoded as UTF-8 with undecodable bytes dropped, and a
	    newline is appended after every file's content.

	    Args:
	        file_paths: Iterable of paths to plain-text files.

	    Returns:
	        One string holding all file contents in input order; ``""`` when no
	        paths are given.
	    """
	    chunks = []
	    for path in file_paths:
	        print(f"Reading text file: {path}")  # Debugging log
	        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
	            chunks.append(handle.read() + "\n")

	    combined = "".join(chunks)
	    print("Extracted text from TXT files:", combined[:500])  # First 500 chars
	    return combined

	def get_text_from_pdf(pdf_files):
	    """Extract and concatenate the page text of the given PDF files.

	    Every page is passed through ``extract_text()``; pages that yield no
	    text are skipped, and a newline is appended after each page that does.

	    Args:
	        pdf_files: Iterable of paths to PDF files.

	    Returns:
	        The extracted text of all pages in input order; ``""`` when no
	        paths are given or no page yields text.
	    """
	    pieces = []
	    for pdf_path in pdf_files:
	        print(f"Reading PDF file: {pdf_path}")  # Debugging log
	        with open(pdf_path, "rb") as stream:
	            pdf = PyPDF2.PdfReader(stream)
	            for page in pdf.pages:
	                extracted = page.extract_text()
	                if not extracted:
	                    continue
	                pieces.append(extracted + "\n")

	    text = "".join(pieces)
	    print("Extracted text from PDF files:", text[:500])  # First 500 chars
	    return text

	def get_text_from_csv(csv_files):
	    """Render each CSV file as a text table and concatenate the results.

	    Every file is loaded with ``pandas.read_csv`` and converted with
	    ``DataFrame.to_string()``, followed by a newline. A file that pandas
	    reports as having no data to parse (for example a zero-byte file) is
	    skipped with a log line instead of aborting the whole extraction.

	    Args:
	        csv_files: Iterable of paths to CSV files.

	    Returns:
	        The concatenated text tables in input order; ``""`` when no paths
	        are given or every file was skipped.
	    """
	    tables = []
	    for csv_path in csv_files:
	        print(f"Reading CSV file: {csv_path}")  # Debugging log
	        try:
	            frame = pd.read_csv(csv_path)
	        except pd.errors.EmptyDataError:
	            # One empty file should not prevent the other files from
	            # being read.
	            print(f"Skipping empty CSV file: {csv_path}")
	            continue
	        tables.append(frame.to_string() + "\n")

	    text = "".join(tables)
	    print("Extracted text from CSV files:", text[:500])  # First 500 chars
	    return text

	def get_text_from_docx(docx_files):
	    """Concatenate the paragraph text of the given DOCX files.

	    Only the entries of each document's ``paragraphs`` collection are read;
	    a newline is appended after every paragraph.

	    Args:
	        docx_files: Iterable of paths to DOCX files.

	    Returns:
	        All paragraph texts in input order; ``""`` when no paths are given.
	    """
	    lines = []
	    for docx_path in docx_files:
	        print(f"Reading DOCX file: {docx_path}")  # Debugging log
	        document = Document(docx_path)
	        lines.extend(paragraph.text + "\n" for paragraph in document.paragraphs)

	    text = "".join(lines)
	    print("Extracted text from DOCX files:", text[:500])  # First 500 chars
	    return text

	def get_text_from_ipynb(ipynb_files):
	    """Concatenate the markdown cells of the given Jupyter notebooks.

	    Each notebook is parsed as JSON and only cells whose ``cell_type`` is
	    ``"markdown"`` contribute text; all other cells are ignored. A cell's
	    ``source`` may be a list of strings (joined with newlines) or a single
	    string (used as is). A newline is appended after every cell.

	    Args:
	        ipynb_files: Iterable of paths to ``.ipynb`` files (UTF-8 JSON).

	    Returns:
	        The markdown text of all notebooks in input order; ``""`` when no
	        paths are given or no markdown cells exist.
	    """
	    parts = []
	    for ipynb_path in ipynb_files:
	        print(f"Reading IPYNB file: {ipynb_path}")  # Debugging log
	        with open(ipynb_path, "r", encoding="utf-8") as file:
	            notebook = json.load(file)
	        for cell in notebook.get("cells", []):
	            if cell.get("cell_type") != "markdown":
	                continue
	            source = cell.get("source", [])
	            if isinstance(source, str):
	                # Joining a plain string would insert a newline between
	                # every character, so use it unchanged.
	                cell_text = source
	            else:
	                cell_text = "\n".join(source)
	            parts.append(cell_text + "\n")

	    text = "".join(parts)
	    print("Extracted text from IPYNB files:", text[:500])  # First 500 chars
	    return text

	def chatbot_interface(question):
	    """Gradio callback: load the course files and report whether text was found.

	    Scans the ``New_Data_Analytics/`` folder, extracts text from every
	    supported file type and returns a status message. The ``question``
	    argument is accepted but not used by the current implementation.

	    Args:
	        question: The text typed by the user.

	    Returns:
	        A message saying either that the files were read or that no
	        supported content was found.
	    """
	    found = extract_files_from_folder("New_Data_Analytics/")

	    # Readers run in this fixed order: PDF, TXT, CSV, DOCX, IPYNB.
	    sections = [
	        get_text_from_pdf(found["pdf"]),
	        read_text_from_files(found["txt"]),
	        get_text_from_csv(found["csv"]),
	        get_text_from_docx(found["docx"]),
	        get_text_from_ipynb(found["ipynb"]),
	    ]
	    text = "".join(sections)

	    print("Final extracted text for chatbot processing:", text[:500])  # First 500 chars

	    if text.strip():
	        return "Files successfully read. Processing question..."
	    return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."

	# Gradio interface: a single question box wired to chatbot_interface, with the
	# returned status message shown in an answer box.
	_question_box = gr.Textbox(label="Ask a question", placeholder="Type your question here...")
	_answer_box = gr.Textbox(label="Answer")
	demo = gr.Interface(fn=chatbot_interface, inputs=_question_box, outputs=_answer_box)

	# Start the web app when the module is executed.
	demo.launch()