Rafa1986 committed on
Commit
0302345
·
verified ·
1 Parent(s): 208be4a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -46
app.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
5
  import openai
6
  import docx
7
  import requests
 
8
  from docx import Document
9
  from langchain_community.embeddings import OpenAIEmbeddings
10
  from langchain_community.vectorstores import FAISS
@@ -26,14 +27,15 @@ def detect_language(text):
26
  openai.api_key = "YOUR_OPENAI_API_KEY"
27
 
28
  def extract_files_from_folder(folder_path):
29
- """Scans a folder and its subfolders for PDF, TXT, CSV, and DOCX files."""
30
- extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": []}
31
 
32
  print(f"Scanning folder: {folder_path}")
33
  for root, subdirs, files in os.walk(folder_path):
34
  print(f"Checking folder: {root}") # Debugging log for subfolders
35
  for file_name in files:
36
  file_path = os.path.join(root, file_name)
 
37
  if file_name.endswith(".pdf"):
38
  extracted_files["pdf"].append(file_path)
39
  elif file_name.endswith(".txt"):
@@ -42,26 +44,12 @@ def extract_files_from_folder(folder_path):
42
  extracted_files["csv"].append(file_path)
43
  elif file_name.endswith(".docx"):
44
  extracted_files["docx"].append(file_path)
 
 
45
 
46
  print("Files found:", extracted_files) # Debugging log
47
  return extracted_files
48
 
49
- def extract_links_from_text(text):
50
- """Extracts links from text files and fetches their content."""
51
- import re
52
- links = re.findall(r'https?://\S+', text)
53
- extracted_content = ""
54
-
55
- for link in links:
56
- try:
57
- response = requests.get(link, timeout=5)
58
- if response.status_code == 200:
59
- extracted_content += f"\n[Extracted from {link}]\n" + response.text[:1000] # Limit to first 1000 chars
60
- except requests.exceptions.RequestException:
61
- extracted_content += f"\n[Could not access {link}]\n"
62
-
63
- return extracted_content
64
-
65
  def read_text_from_files(file_paths):
66
  """Reads text content from a list of files."""
67
  text = ""
@@ -70,7 +58,7 @@ def read_text_from_files(file_paths):
70
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
71
  file_text = file.read()
72
  text += file_text + "\n"
73
- text += extract_links_from_text(file_text) # Extract and add web content
74
  return text
75
 
76
  def get_text_from_pdf(pdf_files):
@@ -83,8 +71,7 @@ def get_text_from_pdf(pdf_files):
83
  page_text = page.extract_text()
84
  if page_text:
85
  text += page_text + "\n"
86
- else:
87
- text += "[Could not extract text from this page]\n"
88
  return text
89
 
90
  def get_text_from_csv(csv_files):
@@ -93,6 +80,7 @@ def get_text_from_csv(csv_files):
93
  print(f"Reading CSV file: {csv_path}") # Debugging log
94
  df = pd.read_csv(csv_path)
95
  text += df.to_string() + "\n"
 
96
  return text
97
 
98
  def get_text_from_docx(docx_files):
@@ -102,38 +90,39 @@ def get_text_from_docx(docx_files):
102
  doc = Document(docx_path)
103
  for para in doc.paragraphs:
104
  text += para.text + "\n"
 
105
  return text
106
 
107
- def create_vector_database(text):
108
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
109
- texts = splitter.split_text(text)
110
- embeddings = OpenAIEmbeddings()
111
- vector_db = FAISS.from_texts(texts, embeddings)
112
- return vector_db
113
-
114
- def correct_exercises(text):
115
- """Uses OpenAI to correct and complete exercises found in the documents."""
116
- response = openai.ChatCompletion.create(
117
- model="gpt-3.5-turbo",
118
- messages=[
119
- {"role": "system", "content": "Analyze the text and complete or correct any incomplete exercises."},
120
- {"role": "user", "content": text}
121
- ]
122
- )
123
- return response["choices"][0]["message"]["content"].strip()
124
 
125
  def chatbot_interface(question):
126
- folder_path = "/mnt/data/New_Data_Analytics/"
127
  extracted_files = extract_files_from_folder(folder_path)
128
 
129
- text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])
 
 
 
 
 
 
 
 
130
 
131
- if not text:
132
- return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."
133
 
134
- corrected_exercises = correct_exercises(text)
135
- vector_db = create_vector_database(text)
136
- return get_answer(question, vector_db, corrected_exercises)
137
 
138
  # Gradio interface
139
  demo = gr.Interface(
@@ -142,4 +131,4 @@ demo = gr.Interface(
142
  outputs=gr.Textbox(label="Answer")
143
  )
144
 
145
- demo.launch()
 
5
  import openai
6
  import docx
7
  import requests
8
+ import json
9
  from docx import Document
10
  from langchain_community.embeddings import OpenAIEmbeddings
11
  from langchain_community.vectorstores import FAISS
 
27
  openai.api_key = "YOUR_OPENAI_API_KEY"
28
 
29
  def extract_files_from_folder(folder_path):
30
+ """Scans a folder and its subfolders for PDF, TXT, CSV, DOCX, and IPYNB files."""
31
+ extracted_files = {"pdf": [], "txt": [], "csv": [], "docx": [], "ipynb": []}
32
 
33
  print(f"Scanning folder: {folder_path}")
34
  for root, subdirs, files in os.walk(folder_path):
35
  print(f"Checking folder: {root}") # Debugging log for subfolders
36
  for file_name in files:
37
  file_path = os.path.join(root, file_name)
38
+ print(f"Found file: {file_path}")
39
  if file_name.endswith(".pdf"):
40
  extracted_files["pdf"].append(file_path)
41
  elif file_name.endswith(".txt"):
 
44
  extracted_files["csv"].append(file_path)
45
  elif file_name.endswith(".docx"):
46
  extracted_files["docx"].append(file_path)
47
+ elif file_name.endswith(".ipynb"):
48
+ extracted_files["ipynb"].append(file_path)
49
 
50
  print("Files found:", extracted_files) # Debugging log
51
  return extracted_files
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def read_text_from_files(file_paths):
54
  """Reads text content from a list of files."""
55
  text = ""
 
58
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
59
  file_text = file.read()
60
  text += file_text + "\n"
61
+ print("Extracted text from TXT files:", text[:500]) # Debugging log (First 500 chars)
62
  return text
63
 
64
  def get_text_from_pdf(pdf_files):
 
71
  page_text = page.extract_text()
72
  if page_text:
73
  text += page_text + "\n"
74
+ print("Extracted text from PDF files:", text[:500]) # Debugging log (First 500 chars)
 
75
  return text
76
 
77
  def get_text_from_csv(csv_files):
 
80
  print(f"Reading CSV file: {csv_path}") # Debugging log
81
  df = pd.read_csv(csv_path)
82
  text += df.to_string() + "\n"
83
+ print("Extracted text from CSV files:", text[:500]) # Debugging log (First 500 chars)
84
  return text
85
 
86
  def get_text_from_docx(docx_files):
 
90
  doc = Document(docx_path)
91
  for para in doc.paragraphs:
92
  text += para.text + "\n"
93
+ print("Extracted text from DOCX files:", text[:500]) # Debugging log (First 500 chars)
94
  return text
95
 
96
def get_text_from_ipynb(ipynb_files):
    """Extract markdown-cell text from a list of Jupyter notebook files.

    Args:
        ipynb_files: Paths to ``.ipynb`` files (JSON in nbformat layout).

    Returns:
        str: Concatenated markdown text, one trailing newline per cell.
        Code cells and outputs are deliberately skipped.
    """
    text = ""
    for ipynb_path in ipynb_files:
        print(f"Reading IPYNB file: {ipynb_path}")  # Debugging log
        with open(ipynb_path, "r", encoding="utf-8") as file:
            notebook = json.load(file)
        for cell in notebook.get("cells", []):
            if cell.get("cell_type") == "markdown":
                source = cell.get("source", [])
                # nbformat allows "source" to be either a list of lines or a
                # single string; joining a string with "\n" would interleave a
                # newline between every character, so normalize both forms.
                if isinstance(source, str):
                    text += source + "\n"
                else:
                    text += "\n".join(source) + "\n"
    print("Extracted text from IPYNB files:", text[:500])  # Debugging log (First 500 chars)
    return text
 
 
 
 
 
 
107
 
108
def chatbot_interface(question):
    """Gradio callback: scan the data folder, pull text out of every
    supported file type, and report whether usable content was found.

    Args:
        question: The user's question from the Gradio textbox (currently
            unused beyond the placeholder response below).

    Returns:
        str: A status message for the Gradio output textbox.
    """
    folder_path = "New_Data_Analytics/"
    files_by_type = extract_files_from_folder(folder_path)

    # One (extractor, bucket-key) pair per supported format; concatenation
    # order matches the original pdf -> txt -> csv -> docx -> ipynb chain.
    extractors = (
        (get_text_from_pdf, "pdf"),
        (read_text_from_files, "txt"),
        (get_text_from_csv, "csv"),
        (get_text_from_docx, "docx"),
        (get_text_from_ipynb, "ipynb"),
    )
    text = "".join(extract(files_by_type[key]) for extract, key in extractors)

    print("Final extracted text for chatbot processing:", text[:500])  # Debugging log (First 500 chars)

    if not text.strip():
        return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."

    return "Files successfully read. Processing question..."
 
 
126
 
127
  # Gradio interface
128
  demo = gr.Interface(
 
131
  outputs=gr.Textbox(label="Answer")
132
  )
133
 
134
+ demo.launch()