Rafa1986 committed
Commit 04d8315 · verified · 1 Parent(s): d60623e

Update app.py

Files changed (1)
  1. app.py +4 -18
app.py CHANGED
@@ -4,17 +4,14 @@ import PyPDF2
 import pandas as pd
 import docx
 import json
-import requests
 from docx import Document
-from langchain_community.vectorstores import FAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from transformers import pipeline
 
 # Configure the Hugging Face API token
 HF_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
 
-# Load the Mistral 7B model from Hugging Face for free
-chatbot_pipeline = pipeline("text-generation", model="tiiuae/falcon-7b-instruct", token=HF_API_TOKEN)
+# Load the DeepSeek Coder 1.3B model
+chatbot_pipeline = pipeline("text-generation", model="deepseek-ai/deepseek-coder-1.3b", token=HF_API_TOKEN)
 
 def extract_files_from_folder(folder_path):
     """Scans a folder for PDF, TXT, CSV, DOCX, and IPYNB files."""
@@ -66,28 +63,17 @@ def get_text_from_docx(docx_files):
             text += para.text + "\n"
     return text
 
-def get_text_from_ipynb(ipynb_files):
-    text = ""
-    for ipynb_path in ipynb_files:
-        with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
-            content = json.load(file)
-        for cell in content.get("cells", []):
-            if cell.get("cell_type") in ["markdown", "code"]:
-                text += "\n".join(cell.get("source", [])) + "\n"
-    return text
-
 def combine_text_from_files(extracted_files):
     text = (
         get_text_from_pdf(extracted_files["pdf"]) +
         read_text_from_files(extracted_files["txt"]) +
         get_text_from_csv(extracted_files["csv"]) +
-        get_text_from_docx(extracted_files["docx"]) +
-        get_text_from_ipynb(extracted_files["ipynb"])
+        get_text_from_docx(extracted_files["docx"])
     )
     return text
 
 def generate_response(question, text):
-    """Uses the Mistral 7B model to answer questions based on extracted text."""
+    """Uses the DeepSeek Coder model to answer questions based on extracted text."""
     prompt = f"Question: {question}\nBased on the following document content:\n{text[:3000]}"  # 3000-character limit
     response = chatbot_pipeline(prompt, max_length=500, truncation=True)[0]['generated_text']
     return response.strip()
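For reference, a minimal usage sketch of how the functions touched by this commit fit together after the change. It is not part of the commit: the "documents/" folder and the question are hypothetical placeholders, and it assumes app.py is importable as a module from the same directory.

# Hypothetical usage sketch; folder path and question are placeholders, not part of the commit.
from app import extract_files_from_folder, combine_text_from_files, generate_response

extracted_files = extract_files_from_folder("documents/")  # per its docstring, file paths grouped by type (pdf, txt, csv, docx, ipynb)
document_text = combine_text_from_files(extracted_files)   # after this commit, IPYNB content is no longer included
answer = generate_response("What topics do these documents cover?", document_text)
print(answer)

Note that chatbot_pipeline is created at module level, so importing app.py loads the DeepSeek Coder weights before any question is asked.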