import gradio as gr
import os
import PyPDF2
import pandas as pd
import json
from docx import Document
from transformers import pipeline

# Configure the Hugging Face API token
HF_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")

# Load the DeepSeek Coder 1.3B instruct model
chatbot_pipeline = pipeline("text-generation", model="deepseek-ai/deepseek-coder-1.3b-instruct", token=HF_API_TOKEN)
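# If HUGGINGFACE_API_TOKEN is unset, token is None; a public model such as this
# one still loads anonymously, but gated or private repos would need a real token.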

def extract_files_from_folder(folder_path):
    """Scans a folder for PDF, TXT, CSV, DOCX, and IPYNB files."""
    extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}
    
    for root, _, files in os.walk(folder_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            if file_name.endswith(".pdf"):
                extracted_files["pdf"].append(file_path)
            elif file_name.endswith(".txt"):
                extracted_files["txt"].append(file_path)
            elif file_name.endswith(".csv"):
                extracted_files["csv"].append(file_path)
            elif file_name.endswith(".docx"):
                extracted_files["docx"].append(file_path)
            elif file_name.endswith(".ipynb"):
                extracted_files["ipynb"].append(file_path)
    return extracted_files

def get_text_from_pdf(pdf_files):
    text = ""
    for pdf_path in pdf_files:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                # extract_text() can return None for image-only pages
                text += (page.extract_text() or "") + "\n"
    return text

def read_text_from_files(file_paths):
    text = ""
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            text += file.read() + "\n"
    return text

def get_text_from_csv(csv_files):
    text = ""
    for csv_path in csv_files:
        df = pd.read_csv(csv_path)
        text += df.to_string() + "\n"
    return text

def get_text_from_docx(docx_files):
    text = ""
    for docx_path in docx_files:
        doc = Document(docx_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    return text
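
def get_text_from_ipynb(ipynb_files):
    """Reads notebook files collected by the scanner above. A minimal sketch,
    assuming the standard notebook JSON layout: a top-level "cells" list whose
    items carry a "source" list of strings."""
    text = ""
    for ipynb_path in ipynb_files:
        with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as nb_file:
            notebook = json.load(nb_file)
        for cell in notebook.get("cells", []):
            text += "".join(cell.get("source", [])) + "\n"
    return text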

def combine_text_from_files(extracted_files):
    text = (
        get_text_from_pdf(extracted_files["pdf"]) +
        read_text_from_files(extracted_files["txt"]) +
        get_text_from_csv(extracted_files["csv"]) +
        get_text_from_docx(extracted_files["docx"]) +
        get_text_from_ipynb(extracted_files["ipynb"])
    )
    return text

def generate_response(question, text):
    """Uses the DeepSeek Coder model to answer questions based on extracted text."""
    prompt = f"Question: {question}\nBased on the following document content:\n{text[:3000]}"  # Limite de 3000 caracteres
    response = chatbot_pipeline(prompt, max_length=500, truncation=True)[0]['generated_text']
    return response.strip()

def chatbot_interface(question):
    folder_path = "New_Data_Analytics/"
    extracted_files = extract_files_from_folder(folder_path)
    text = combine_text_from_files(extracted_files)
    
    if not text.strip():
        return "No readable files found in New_Data_Analytics/. Add PDF, TXT, CSV, DOCX, or IPYNB files to that folder."
    
    return generate_response(question, text)

demo = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(label="Ask a question", placeholder="Type your question here..."),
    outputs=gr.Textbox(label="Answer")
)

if __name__ == "__main__":
    demo.launch()
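
# Running this file as a script (e.g. `python app.py`; filename assumed) serves
# the Gradio UI locally, by default at http://127.0.0.1:7860.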