# Rafa1986's picture
# Update app.py
# 447ae2a verified
import gradio as gr
import os
import PyPDF2
import pandas as pd
import docx
import json
from docx import Document
from transformers import pipeline
# Read the Hugging Face API token from the environment (None when unset).
HF_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")

# Load the DeepSeek Coder 1.3B instruct model once, when the module is imported.
chatbot_pipeline = pipeline(
    task="text-generation",
    model="deepseek-ai/deepseek-coder-1.3b-instruct",
    token=HF_API_TOKEN,
)
def extract_files_from_folder(folder_path):
    """Recursively collect the supported document files under ``folder_path``.

    Files are grouped by extension. Matching is case-insensitive, so
    ``REPORT.PDF`` is picked up exactly like ``report.pdf``.

    Args:
        folder_path: Directory to walk, including all of its sub-directories.

    Returns:
        A dict with the keys ``"pdf"``, ``"txt"``, ``"csv"``, ``"docx"`` and
        ``"ipynb"``, each mapping to the list of matching file paths. Every
        key is always present, even when its list is empty.
    """
    extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}
    for root, _, files in os.walk(folder_path):
        for file_name in files:
            # Compare against a lower-cased copy only; the stored path keeps
            # the file's real spelling so it can still be opened.
            lowered_name = file_name.lower()
            for kind, paths in extracted_files.items():
                if lowered_name.endswith("." + kind):
                    paths.append(os.path.join(root, file_name))
                    break
    return extracted_files
def get_text_from_pdf(pdf_files):
    """Return the text of every page of every PDF in ``pdf_files``.

    Pages are emitted in file order, then page order, and each page's text is
    followed by a newline. An empty ``pdf_files`` yields an empty string.
    """
    page_texts = []
    for pdf_path in pdf_files:
        with open(pdf_path, "rb") as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                page_texts.append(page.extract_text() + "\n")
    return "".join(page_texts)
def read_text_from_files(file_paths):
    """Return the contents of the given text files, each followed by a newline.

    Files are decoded as UTF-8; bytes that cannot be decoded are dropped
    rather than raising.
    """
    def _slurp(path):
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            return handle.read()

    return "".join(_slurp(path) + "\n" for path in file_paths)
def get_text_from_csv(csv_files):
    """Render each CSV file as a pandas table string and concatenate them.

    Every file is loaded with ``pd.read_csv`` and formatted with
    ``DataFrame.to_string()``; each rendered table is followed by a newline.
    """
    rendered_tables = [
        pd.read_csv(csv_path).to_string() + "\n" for csv_path in csv_files
    ]
    return "".join(rendered_tables)
def get_text_from_docx(docx_files):
    """Return the paragraph text of the given Word documents.

    Paragraphs are emitted in document order, one per line (each paragraph's
    text is followed by a newline). Only ``doc.paragraphs`` is read.
    """
    paragraph_lines = []
    for docx_path in docx_files:
        paragraph_lines.extend(
            para.text + "\n" for para in Document(docx_path).paragraphs
        )
    return "".join(paragraph_lines)
def _get_text_from_ipynb(ipynb_files):
    """Return the source of every cell of the given Jupyter notebooks."""
    chunks = []
    for ipynb_path in ipynb_files:
        try:
            with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as notebook_file:
                notebook = json.load(notebook_file)
        except json.JSONDecodeError:
            # Notebooks used to be ignored altogether, so a malformed one is
            # skipped rather than allowed to break every question.
            continue
        cells = notebook.get("cells", []) if isinstance(notebook, dict) else []
        for cell in cells:
            source = cell.get("source", "") if isinstance(cell, dict) else ""
            # A cell's source may be stored as one string or a list of lines.
            if isinstance(source, list):
                source = "".join(str(part) for part in source)
            if isinstance(source, str):
                chunks.append(source + "\n")
    return "".join(chunks)


def combine_text_from_files(extracted_files):
    """Concatenate the text extracted from every supported file type.

    Args:
        extracted_files: Mapping of file kind to list of paths. The keys
            ``"pdf"``, ``"txt"``, ``"csv"`` and ``"docx"`` are required;
            ``"ipynb"`` is optional so mappings without it keep working.

    Returns:
        One string holding the PDF, TXT, CSV, DOCX and notebook text, in that
        order.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    return (
        get_text_from_pdf(extracted_files["pdf"])
        + read_text_from_files(extracted_files["txt"])
        + get_text_from_csv(extracted_files["csv"])
        + get_text_from_docx(extracted_files["docx"])
        # Notebooks are collected by the folder scan, so include them too.
        + _get_text_from_ipynb(extracted_files.get("ipynb", []))
    )
def generate_response(question, text, max_context_chars=3000, max_new_tokens=500):
    """Answer ``question`` with the DeepSeek Coder model, using ``text`` as context.

    Args:
        question: The user's question.
        text: Document text to ground the answer in.
        max_context_chars: Number of leading characters of ``text`` placed in
            the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated answer with surrounding whitespace stripped.
    """
    prompt = (
        f"Question: {question}\n"
        f"Based on the following document content:\n{text[:max_context_chars]}"
    )
    outputs = chatbot_pipeline(
        prompt,
        # max_length would count the prompt's tokens as well, leaving little or
        # no room for an answer; max_new_tokens limits only the generated part.
        max_new_tokens=max_new_tokens,
        truncation=True,
        # By default the pipeline returns the prompt followed by the
        # continuation; return only the newly generated answer.
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()
def chatbot_interface(question):
    """Answer ``question`` from the documents in ``New_Data_Analytics/``.

    The folder is scanned and its files are read on every call. When no text
    can be extracted, a fixed notice is returned instead of a model answer.
    """
    discovered_files = extract_files_from_folder("New_Data_Analytics/")
    document_text = combine_text_from_files(discovered_files)
    if document_text.strip():
        return generate_response(question, document_text)
    return "No valid files found. Please upload supported file types."
# Build the one-question / one-answer web UI and start serving it.
demo = gr.Interface(
    chatbot_interface,
    gr.Textbox(placeholder="Type your question here...", label="Ask a question"),
    gr.Textbox(label="Answer"),
)
demo.launch()