Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,17 +4,14 @@ import PyPDF2
|
|
4 |
import pandas as pd
|
5 |
import docx
|
6 |
import json
|
7 |
-
import requests
|
8 |
from docx import Document
|
9 |
-
from langchain_community.vectorstores import FAISS
|
10 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from transformers import pipeline
|
12 |
|
13 |
# Configurar Hugging Face API Token
|
14 |
HF_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
|
15 |
|
16 |
-
# Carregar o modelo
|
17 |
-
chatbot_pipeline = pipeline("text-generation", model="
|
18 |
|
19 |
def extract_files_from_folder(folder_path):
|
20 |
"""Scans a folder for PDF, TXT, CSV, DOCX, and IPYNB files."""
|
@@ -66,28 +63,17 @@ def get_text_from_docx(docx_files):
|
|
66 |
text += para.text + "\n"
|
67 |
return text
|
68 |
|
69 |
-
def get_text_from_ipynb(ipynb_files):
    """Collect the source of every markdown and code cell in the given notebooks.

    Args:
        ipynb_files: Iterable of paths to ``.ipynb`` files (JSON documents).

    Returns:
        A single string holding each selected cell's source followed by a
        newline, in file order and then cell order. Empty when no paths are
        given or no notebook has a matching cell.

    Raises:
        OSError: If a notebook cannot be opened.
        json.JSONDecodeError: If a notebook is not valid JSON.
    """
    pieces = []
    for ipynb_path in ipynb_files:
        with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
            content = json.load(file)
        for cell in content.get("cells", []):
            if cell.get("cell_type") not in ("markdown", "code"):
                continue
            source = cell.get("source", [])
            # A cell's source may be stored as one string instead of a list
            # of lines; joining a bare string would put a newline between
            # every character, so wrap it first.
            if isinstance(source, str):
                source = [source]
            pieces.append("\n".join(source) + "\n")
    return "".join(pieces)
|
78 |
-
|
79 |
def combine_text_from_files(extracted_files):
|
80 |
text = (
|
81 |
get_text_from_pdf(extracted_files["pdf"]) +
|
82 |
read_text_from_files(extracted_files["txt"]) +
|
83 |
get_text_from_csv(extracted_files["csv"]) +
|
84 |
-
get_text_from_docx(extracted_files["docx"])
|
85 |
-
get_text_from_ipynb(extracted_files["ipynb"])
|
86 |
)
|
87 |
return text
|
88 |
|
89 |
def generate_response(question, text):
|
90 |
-
"""Uses the
|
91 |
prompt = f"Question: {question}\nBased on the following document content:\n{text[:3000]}" # Limite de 3000 caracteres
|
92 |
response = chatbot_pipeline(prompt, max_length=500, truncation=True)[0]['generated_text']
|
93 |
return response.strip()
|
|
|
4 |
import pandas as pd
|
5 |
import docx
|
6 |
import json
|
|
|
7 |
from docx import Document
|
|
|
|
|
8 |
from transformers import pipeline
|
9 |
|
10 |
# Configure the Hugging Face API token
|
11 |
HF_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
|
12 |
|
13 |
+
# Load the DeepSeek Coder 1.3B model
|
14 |
+
chatbot_pipeline = pipeline("text-generation", model="deepseek-ai/deepseek-coder-1.3b", token=HF_API_TOKEN)
|
15 |
|
16 |
def extract_files_from_folder(folder_path):
|
17 |
"""Scans a folder for PDF, TXT, CSV, DOCX, and IPYNB files."""
|
|
|
63 |
text += para.text + "\n"
|
64 |
return text
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
def combine_text_from_files(extracted_files):
    """Concatenate the text extracted from each supported file type.

    ``extracted_files`` is indexed with the keys "pdf", "txt", "csv" and
    "docx"; each value is passed to the matching extractor and the results
    are appended in that order.
    """
    combined = get_text_from_pdf(extracted_files["pdf"])
    combined = combined + read_text_from_files(extracted_files["txt"])
    combined = combined + get_text_from_csv(extracted_files["csv"])
    combined = combined + get_text_from_docx(extracted_files["docx"])
    return combined
|
74 |
|
75 |
def generate_response(question, text):
    """Answer a question with the module-level text-generation pipeline.

    Args:
        question: The user's question.
        text: Document text to use as context; only its first 3000
            characters are placed in the prompt.

    Returns:
        The text generated by ``chatbot_pipeline`` with surrounding
        whitespace stripped. The prompt itself is not part of the result.
    """
    # Cap the context at 3000 characters to keep the prompt small.
    prompt = f"Question: {question}\nBased on the following document content:\n{text[:3000]}"
    outputs = chatbot_pipeline(
        prompt,
        # max_length also counts the prompt's tokens, so a long prompt could
        # leave no room to generate; budget the newly generated tokens instead.
        max_new_tokens=500,
        truncation=True,
        # By default generated_text starts with the prompt; return only the
        # continuation so the caller receives just the answer.
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()
|