Update app.py
app.py
CHANGED
@@ -3,14 +3,11 @@ import os
 import PyPDF2
 import pandas as pd
 import openai
-import zipfile
-from io import BytesIO
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_community.llms import OpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-
 def detect_language(text):
     """Detects the language of the input text using OpenAI."""
     response = openai.ChatCompletion.create(
@@ -25,52 +22,42 @@ def detect_language(text):
 # Set up OpenAI API key (replace with your key)
 openai.api_key = "YOUR_OPENAI_API_KEY"
 
-def
-    """
+def extract_files_from_folder(folder_path):
+    """Scans a folder and its subfolders for PDF, TXT, and CSV files."""
     extracted_files = {"pdf": [], "txt": [], "csv": []}
 
-    for file_name in
-        elif file_name.endswith(".csv"):
-            extracted_files["csv"].append(BytesIO(content))
+    for root, _, files in os.walk(folder_path):
+        for file_name in files:
+            file_path = os.path.join(root, file_name)
+            if file_name.endswith(".pdf"):
+                extracted_files["pdf"].append(file_path)
+            elif file_name.endswith(".txt"):
+                extracted_files["txt"].append(file_path)
+            elif file_name.endswith(".csv"):
+                extracted_files["csv"].append(file_path)
     return extracted_files
 
-def
-    """
-    response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "Analyze this document and extract key points, links, and complementary information."},
-            {"role": "user", "content": text}
-        ]
-    )
-    return response["choices"][0]["message"]["content"].strip()
-
-def get_text_from_pdf(pdf_files):
+def read_text_from_files(file_paths):
+    """Reads text content from a list of files."""
     text = ""
-    for
-            text += page.extract_text() + "\n"
+    for file_path in file_paths:
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
+            text += file.read() + "\n"
     return text
 
-def
+def get_text_from_pdf(pdf_files):
     text = ""
-    for
+    for pdf_path in pdf_files:
+        with open(pdf_path, "rb") as pdf_file:
+            reader = PyPDF2.PdfReader(pdf_file)
+            for page in reader.pages:
+                text += page.extract_text() + "\n"
     return text
 
 def get_text_from_csv(csv_files):
     text = ""
-    for
-        df = pd.read_csv(
+    for csv_path in csv_files:
+        df = pd.read_csv(csv_path)
         text += df.to_string() + "\n"
     return text
 
@@ -81,7 +68,18 @@ def create_vector_database(text):
     vector_db = FAISS.from_texts(texts, embeddings)
     return vector_db
 
-def get_answer(question, vector_db, analysis):
+def correct_exercises(text):
+    """Uses OpenAI to correct and complete exercises found in the documents."""
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "Analyze the text and complete or correct any incomplete exercises."},
+            {"role": "user", "content": text}
+        ]
+    )
+    return response["choices"][0]["message"]["content"].strip()
+
+def get_answer(question, vector_db, corrected_exercises):
     retriever = vector_db.as_retriever()
     docs = retriever.get_relevant_documents(question)
 
@@ -93,33 +91,33 @@ def get_answer(question, vector_db, analysis):
     response = openai.ChatCompletion.create(
         model="gpt-3.5-turbo",
         messages=[
-            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents
-            {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\
+            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."},
+            {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nCorrected Exercises:\n" + corrected_exercises}
         ]
     )
     return response["choices"][0]["message"]["content"]
 
-def chatbot_interface(
-    if not
-        return "Please
+def chatbot_interface(folder_path, question):
+    if not folder_path:
+        return "Please provide a folder path before asking a question."
 
-    extracted_files =
+    extracted_files = extract_files_from_folder(folder_path)
+
+    text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"])
 
     if not text:
-        return "The
+        return "The folder does not contain valid PDF, TXT, or CSV files. Please upload supported file types."
 
+    corrected_exercises = correct_exercises(text)
     vector_db = create_vector_database(text)
-    return get_answer(question, vector_db,
+    return get_answer(question, vector_db, corrected_exercises)
 
 # Gradio interface
 demo = gr.Interface(
     fn=chatbot_interface,
-    inputs=[gr.
+    inputs=[gr.Textbox(label="Folder Path", placeholder="Enter the path to the folder containing the documents"),
             gr.Textbox(label="Ask a question", placeholder="Type your question here...")],
     outputs=gr.Textbox(label="Answer")
 )
 
 demo.launch()
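
For orientation, a minimal usage sketch (not part of the commit) of the new folder-based flow, assuming the helper functions defined in app.py above are already in scope; the folder path "./sample_docs" is a hypothetical example:

    # Gather PDF/TXT/CSV paths from a folder tree and build the combined text,
    # mirroring the first half of chatbot_interface() before any OpenAI calls.
    folder = "./sample_docs"  # hypothetical example folder
    files = extract_files_from_folder(folder)
    print({kind: len(paths) for kind, paths in files.items()})

    text = (
        get_text_from_pdf(files["pdf"])
        + read_text_from_files(files["txt"])
        + get_text_from_csv(files["csv"])
    )
    print(f"Extracted {len(text)} characters of text")

The remaining steps in chatbot_interface (correct_exercises, create_vector_database, get_answer) additionally require a valid openai.api_key, since they call the OpenAI API.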