Spaces:

Rafa1986
/

Data-Analytics-Class

Sleeping

App Files Files Community

Rafa1986 committed on Mar 15

Commit

d99694d

verified ·

1 Parent(s): a567f52

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -49

app.py CHANGED Viewed

@@ -2,40 +2,27 @@ import gradio as gr
 import os
 import PyPDF2
 import pandas as pd
-import openai
 import docx
 import json
 from docx import Document
-from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import FAISS
-from langchain_community.llms import OpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
def detect_language(text):
    """Detect the language of ``text`` by asking an OpenAI chat model.

    Args:
        text: The text whose language should be identified.

    Returns:
        The model's reply with surrounding whitespace stripped. The reply is
        free-form model output, not a normalized language code.

    Raises:
        ValueError: If the ``OPENAI_API_KEY`` environment variable is unset
            or empty.
    """
    # os.getenv() expects the *name* of an environment variable. The secret
    # itself must live only in the environment, never in source control; any
    # key that was ever committed should be treated as compromised and rotated.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("API Key da OpenAI não definida. Configure a variável de ambiente OPENAI_API_KEY.")
    client = openai.Client(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Detect the language of this text."},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content.strip()
 def extract_files_from_folder(folder_path):
-    """Scans a folder and its subfolders for PDF, TXT, CSV, DOCX, and IPYNB files."""
     extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}
-    print(f"Scanning folder: {folder_path}")
-    for root, subdirs, files in os.walk(folder_path):
-        print(f"Checking folder: {root}")  # Debugging log for subfolders
         for file_name in files:
             file_path = os.path.join(root, file_name)
-            print(f"Found file: {file_path}")
             if file_name.endswith(".pdf"):
                 extracted_files["pdf"].append(file_path)
             elif file_name.endswith(".txt"):
@@ -46,12 +33,9 @@ def extract_files_from_folder(folder_path):
                 extracted_files["docx"].append(file_path)
             elif file_name.endswith(".ipynb"):
                 extracted_files["ipynb"].append(file_path)
-    print("Files found:", extracted_files)  # Debugging log
     return extracted_files
 def get_text_from_pdf(pdf_files):
-    """Extracts text from PDF files."""
     text = ""
     for pdf_path in pdf_files:
         with open(pdf_path, "rb") as pdf_file:
@@ -61,7 +45,6 @@ def get_text_from_pdf(pdf_files):
     return text
 def read_text_from_files(file_paths):
-    """Reads text content from TXT files."""
     text = ""
     for file_path in file_paths:
         with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
@@ -69,7 +52,6 @@ def read_text_from_files(file_paths):
     return text
 def get_text_from_csv(csv_files):
-    """Extracts text from CSV files."""
     text = ""
     for csv_path in csv_files:
         df = pd.read_csv(csv_path)
@@ -77,7 +59,6 @@ def get_text_from_csv(csv_files):
     return text
 def get_text_from_docx(docx_files):
-    """Extracts text from DOCX files."""
     text = ""
     for docx_path in docx_files:
         doc = Document(docx_path)
@@ -86,18 +67,16 @@ def get_text_from_docx(docx_files):
     return text
 def get_text_from_ipynb(ipynb_files):
     """Concatenate the source of markdown and code cells from notebooks.

     Args:
         ipynb_files: Iterable of paths to Jupyter Notebook (.ipynb) files.

     Returns:
         One string holding the source of every markdown/code cell, each cell
         terminated by a newline. Empty when no paths are given or no
         matching cells exist.

     Raises:
         json.JSONDecodeError: If a file is not valid JSON.
     """
     chunks = []
     for ipynb_path in ipynb_files:
         with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
             content = json.load(file)
         # The file is fully parsed at this point, so it can be closed before
         # the cells are walked.
         for cell in content.get("cells", []):
             if cell.get("cell_type") not in ("markdown", "code"):
                 continue
             source = cell.get("source", [])
             # The notebook format allows "source" to be a single string or a
             # list of lines; joining a bare string would insert a newline
             # between every character.
             if isinstance(source, str):
                 chunks.append(source + "\n")
             else:
                 chunks.append("\n".join(source) + "\n")
     return "".join(chunks)
 def combine_text_from_files(extracted_files):
-    """Combines text from all extracted files."""
     text = (
         get_text_from_pdf(extracted_files["pdf"]) +
         read_text_from_files(extracted_files["txt"]) +
@@ -108,34 +87,21 @@ def combine_text_from_files(extracted_files):
     return text
 def generate_response(question, text):
     """Answer ``question`` with an OpenAI chat model, grounded in ``text``.

     Only the first 3000 characters of ``text`` are sent to the model.
     Raises ValueError when the OPENAI_API_KEY environment variable is not set.
     """
     key = os.getenv("OPENAI_API_KEY")
     if not key:
         raise ValueError("API Key da OpenAI não definida. Configure a variável de ambiente OPENAI_API_KEY.")
     client = openai.Client(api_key=key)
     system_message = {
         "role": "system",
         "content": "You are a data analytics assistant. Answer the question based on the provided document content.",
     }
     # Cap the document excerpt at 3000 characters to avoid excessive token usage.
     user_message = {
         "role": "user",
         "content": f"{question}\n\nBased on the following document content:\n{text[:3000]}",
     }
     completion = client.chat.completions.create(
         model="gpt-3.5-turbo",
         messages=[system_message, user_message],
     )
     answer = completion.choices[0].message.content
     return answer.strip()
 def chatbot_interface(question):
     """Answer ``question`` from the documents under ``New_Data_Analytics/``.

     Scans the folder, merges the text of every supported file, and hands the
     result to ``generate_response``. Returns a fixed help message when the
     merged text is empty or whitespace only.
     """
     files_by_type = extract_files_from_folder("New_Data_Analytics/")
     document_text = combine_text_from_files(files_by_type)
     # Debugging log: preview the first 500 characters of the merged text.
     print("Final extracted text for chatbot processing:", document_text[:500])
     if document_text.strip():
         return generate_response(question, document_text)
     return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."
-# Gradio interface
 demo = gr.Interface(
     fn=chatbot_interface,
     inputs=gr.Textbox(label="Ask a question", placeholder="Type your question here..."),

 import os
 import PyPDF2
 import pandas as pd
 import docx
 import json
+import requests
 from docx import Document
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from transformers import pipeline
+# Configure the Hugging Face API token (read from the HUGGINGFACE_API_TOKEN environment variable; None when unset)
+HF_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
+# Load the Mistral 7B Instruct model from Hugging Face at no cost; this statement runs once, when the module is imported
+chatbot_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1", token=HF_API_TOKEN)
 def extract_files_from_folder(folder_path):
+    """Scans a folder for PDF, TXT, CSV, DOCX, and IPYNB files."""
     extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}
+    for root, _, files in os.walk(folder_path):
         for file_name in files:
             file_path = os.path.join(root, file_name)
             if file_name.endswith(".pdf"):
                 extracted_files["pdf"].append(file_path)
             elif file_name.endswith(".txt"):
                 extracted_files["docx"].append(file_path)
             elif file_name.endswith(".ipynb"):
                 extracted_files["ipynb"].append(file_path)
     return extracted_files
 def get_text_from_pdf(pdf_files):
     text = ""
     for pdf_path in pdf_files:
         with open(pdf_path, "rb") as pdf_file:
     return text
 def read_text_from_files(file_paths):
     text = ""
     for file_path in file_paths:
         with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
     return text
 def get_text_from_csv(csv_files):
     text = ""
     for csv_path in csv_files:
         df = pd.read_csv(csv_path)
     return text
 def get_text_from_docx(docx_files):
     text = ""
     for docx_path in docx_files:
         doc = Document(docx_path)
     return text
 def get_text_from_ipynb(ipynb_files):
     """Concatenate the source of markdown and code cells from notebooks.

     Args:
         ipynb_files: Iterable of paths to Jupyter Notebook (.ipynb) files.

     Returns:
         One string holding the source of every markdown/code cell, each cell
         terminated by a newline. Empty when no paths are given or no
         matching cells exist.

     Raises:
         json.JSONDecodeError: If a file is not valid JSON.
     """
     chunks = []
     for ipynb_path in ipynb_files:
         with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
             content = json.load(file)
         # The file is fully parsed at this point, so it can be closed before
         # the cells are walked.
         for cell in content.get("cells", []):
             if cell.get("cell_type") not in ("markdown", "code"):
                 continue
             source = cell.get("source", [])
             # The notebook format allows "source" to be a single string or a
             # list of lines; joining a bare string would insert a newline
             # between every character.
             if isinstance(source, str):
                 chunks.append(source + "\n")
             else:
                 chunks.append("\n".join(source) + "\n")
     return "".join(chunks)
 def combine_text_from_files(extracted_files):
     text = (
         get_text_from_pdf(extracted_files["pdf"]) +
         read_text_from_files(extracted_files["txt"]) +
     return text
 def generate_response(question, text):
     """Answer ``question`` with the Mistral 7B pipeline, grounded in ``text``.

     Args:
         question: The user's question.
         text: Extracted document text; only its first 3000 characters are
             placed in the prompt.

     Returns:
         The text generated by the model (without the prompt), stripped of
         surrounding whitespace.
     """
     prompt = f"Question: {question}\nBased on the following document content:\n{text[:3000]}"  # 3000-character limit
     # ``max_length`` counts the prompt tokens too, so a long prompt could
     # leave no budget for the answer; ``max_new_tokens`` limits only the
     # generated continuation. ``return_full_text=False`` stops the pipeline
     # from echoing the prompt (and the document excerpt) ahead of the answer.
     outputs = chatbot_pipeline(
         prompt,
         max_new_tokens=500,
         truncation=True,
         return_full_text=False,
     )
     return outputs[0]["generated_text"].strip()
 def chatbot_interface(question):
     """Answer ``question`` from the documents under ``New_Data_Analytics/``.

     Scans the folder, merges the text of every supported file, and passes it
     to ``generate_response``. Returns a fixed notice when the merged text is
     empty or whitespace only.
     """
     files_by_type = extract_files_from_folder("New_Data_Analytics/")
     corpus = combine_text_from_files(files_by_type)
     if corpus.strip():
         return generate_response(question, corpus)
     return "No valid files found. Please upload supported file types."
 demo = gr.Interface(
     fn=chatbot_interface,
     inputs=gr.Textbox(label="Ask a question", placeholder="Type your question here..."),