Rafa1986 committed on
Commit
d99694d
·
verified ·
1 Parent(s): a567f52

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -49
app.py CHANGED
@@ -2,40 +2,27 @@ import gradio as gr
2
  import os
3
  import PyPDF2
4
  import pandas as pd
5
- import openai
6
  import docx
7
  import json
 
8
  from docx import Document
9
- from langchain_community.embeddings import OpenAIEmbeddings
10
  from langchain_community.vectorstores import FAISS
11
- from langchain_community.llms import OpenAI
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
13
 
14
- def detect_language(text):
15
- """Detects the language of the input text using OpenAI."""
16
- api_key = os.getenv("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")
17
- if not api_key:
18
- raise ValueError("API Key da OpenAI não definida. Configure a variável de ambiente OPENAI_API_KEY.")
19
- client = openai.Client(api_key=api_key)
20
- response = client.chat.completions.create(
21
- model="gpt-3.5-turbo",
22
- messages=[
23
- {"role": "system", "content": "Detect the language of this text."},
24
- {"role": "user", "content": text}
25
- ]
26
- )
27
- return response.choices[0].message.content.strip()
28
 
29
  def extract_files_from_folder(folder_path):
30
- """Scans a folder and its subfolders for PDF, TXT, CSV, DOCX, and IPYNB files."""
31
  extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}
32
 
33
- print(f"Scanning folder: {folder_path}")
34
- for root, subdirs, files in os.walk(folder_path):
35
- print(f"Checking folder: {root}") # Debugging log for subfolders
36
  for file_name in files:
37
  file_path = os.path.join(root, file_name)
38
- print(f"Found file: {file_path}")
39
  if file_name.endswith(".pdf"):
40
  extracted_files["pdf"].append(file_path)
41
  elif file_name.endswith(".txt"):
@@ -46,12 +33,9 @@ def extract_files_from_folder(folder_path):
46
  extracted_files["docx"].append(file_path)
47
  elif file_name.endswith(".ipynb"):
48
  extracted_files["ipynb"].append(file_path)
49
-
50
- print("Files found:", extracted_files) # Debugging log
51
  return extracted_files
52
 
53
  def get_text_from_pdf(pdf_files):
54
- """Extracts text from PDF files."""
55
  text = ""
56
  for pdf_path in pdf_files:
57
  with open(pdf_path, "rb") as pdf_file:
@@ -61,7 +45,6 @@ def get_text_from_pdf(pdf_files):
61
  return text
62
 
63
  def read_text_from_files(file_paths):
64
- """Reads text content from TXT files."""
65
  text = ""
66
  for file_path in file_paths:
67
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
@@ -69,7 +52,6 @@ def read_text_from_files(file_paths):
69
  return text
70
 
71
  def get_text_from_csv(csv_files):
72
- """Extracts text from CSV files."""
73
  text = ""
74
  for csv_path in csv_files:
75
  df = pd.read_csv(csv_path)
@@ -77,7 +59,6 @@ def get_text_from_csv(csv_files):
77
  return text
78
 
79
  def get_text_from_docx(docx_files):
80
- """Extracts text from DOCX files."""
81
  text = ""
82
  for docx_path in docx_files:
83
  doc = Document(docx_path)
@@ -86,18 +67,16 @@ def get_text_from_docx(docx_files):
86
  return text
87
 
88
  def get_text_from_ipynb(ipynb_files):
89
- """Extracts text from Jupyter Notebook (.ipynb) files."""
90
  text = ""
91
  for ipynb_path in ipynb_files:
92
  with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
93
  content = json.load(file)
94
  for cell in content.get("cells", []):
95
- if cell.get("cell_type") == "markdown" or cell.get("cell_type") == "code":
96
  text += "\n".join(cell.get("source", [])) + "\n"
97
  return text
98
 
99
  def combine_text_from_files(extracted_files):
100
- """Combines text from all extracted files."""
101
  text = (
102
  get_text_from_pdf(extracted_files["pdf"]) +
103
  read_text_from_files(extracted_files["txt"]) +
@@ -108,34 +87,21 @@ def combine_text_from_files(extracted_files):
108
  return text
109
 
110
  def generate_response(question, text):
111
- """Uses OpenAI to answer a question based on extracted text."""
112
- api_key = os.getenv("OPENAI_API_KEY")
113
- if not api_key:
114
- raise ValueError("API Key da OpenAI não definida. Configure a variável de ambiente OPENAI_API_KEY.")
115
- client = openai.Client(api_key=api_key)
116
- response = client.chat.completions.create(
117
- model="gpt-3.5-turbo",
118
- messages=[
119
- {"role": "system", "content": "You are a data analytics assistant. Answer the question based on the provided document content."},
120
- {"role": "user", "content": f"{question}\n\nBased on the following document content:\n{text[:3000]}"} # Limit to 3000 characters to avoid excessive token usage
121
- ]
122
- )
123
- return response.choices[0].message.content.strip()
124
 
125
  def chatbot_interface(question):
126
  folder_path = "New_Data_Analytics/"
127
  extracted_files = extract_files_from_folder(folder_path)
128
-
129
  text = combine_text_from_files(extracted_files)
130
 
131
- print("Final extracted text for chatbot processing:", text[:500]) # Debugging log (First 500 chars)
132
-
133
  if not text.strip():
134
- return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."
135
 
136
  return generate_response(question, text)
137
 
138
- # Gradio interface
139
  demo = gr.Interface(
140
  fn=chatbot_interface,
141
  inputs=gr.Textbox(label="Ask a question", placeholder="Type your question here..."),
 
2
  import os
3
  import PyPDF2
4
  import pandas as pd
 
5
  import docx
6
  import json
7
+ import requests
8
  from docx import Document
 
9
  from langchain_community.vectorstores import FAISS
 
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from transformers import pipeline
12
 
13
+ # Configurar Hugging Face API Token
14
+ HF_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
15
+
16
+ # Carregar o modelo Mistral 7B gratuitamente do Hugging Face
17
+ chatbot_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1", token=HF_API_TOKEN)
 
 
 
 
 
 
 
 
 
18
 
19
  def extract_files_from_folder(folder_path):
20
+ """Scans a folder for PDF, TXT, CSV, DOCX, and IPYNB files."""
21
  extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}
22
 
23
+ for root, _, files in os.walk(folder_path):
 
 
24
  for file_name in files:
25
  file_path = os.path.join(root, file_name)
 
26
  if file_name.endswith(".pdf"):
27
  extracted_files["pdf"].append(file_path)
28
  elif file_name.endswith(".txt"):
 
33
  extracted_files["docx"].append(file_path)
34
  elif file_name.endswith(".ipynb"):
35
  extracted_files["ipynb"].append(file_path)
 
 
36
  return extracted_files
37
 
38
  def get_text_from_pdf(pdf_files):
 
39
  text = ""
40
  for pdf_path in pdf_files:
41
  with open(pdf_path, "rb") as pdf_file:
 
45
  return text
46
 
47
  def read_text_from_files(file_paths):
 
48
  text = ""
49
  for file_path in file_paths:
50
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
 
52
  return text
53
 
54
  def get_text_from_csv(csv_files):
 
55
  text = ""
56
  for csv_path in csv_files:
57
  df = pd.read_csv(csv_path)
 
59
  return text
60
 
61
  def get_text_from_docx(docx_files):
 
62
  text = ""
63
  for docx_path in docx_files:
64
  doc = Document(docx_path)
 
67
  return text
68
 
69
  def get_text_from_ipynb(ipynb_files):
 
70
  text = ""
71
  for ipynb_path in ipynb_files:
72
  with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
73
  content = json.load(file)
74
  for cell in content.get("cells", []):
75
+ if cell.get("cell_type") in ["markdown", "code"]:
76
  text += "\n".join(cell.get("source", [])) + "\n"
77
  return text
78
 
79
  def combine_text_from_files(extracted_files):
 
80
  text = (
81
  get_text_from_pdf(extracted_files["pdf"]) +
82
  read_text_from_files(extracted_files["txt"]) +
 
87
  return text
88
 
89
  def generate_response(question, text):
90
+ """Uses the Mistral 7B model to answer questions based on extracted text."""
91
+ prompt = f"Question: {question}\nBased on the following document content:\n{text[:3000]}" # Limite de 3000 caracteres
92
+ response = chatbot_pipeline(prompt, max_length=500, truncation=True)[0]['generated_text']
93
+ return response.strip()
 
 
 
 
 
 
 
 
 
94
 
95
  def chatbot_interface(question):
96
  folder_path = "New_Data_Analytics/"
97
  extracted_files = extract_files_from_folder(folder_path)
 
98
  text = combine_text_from_files(extracted_files)
99
 
 
 
100
  if not text.strip():
101
+ return "No valid files found. Please upload supported file types."
102
 
103
  return generate_response(question, text)
104
 
 
105
  demo = gr.Interface(
106
  fn=chatbot_interface,
107
  inputs=gr.Textbox(label="Ask a question", placeholder="Type your question here..."),