Spaces:

Rafa1986
/

Data-Analytics-Class

Sleeping

App Files Files Community

Rafa1986 committed on Mar 14

Commit

a6d5350

verified ·

1 Parent(s): 0c80efa

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -0

app.py CHANGED Viewed

@@ -50,6 +50,52 @@ def extract_files_from_folder(folder_path):
     print("Files found:", extracted_files)  # Debugging log
     return extracted_files
 def combine_text_from_files(extracted_files):
     """Combines text from all extracted files."""
     text = (

     print("Files found:", extracted_files)  # Debugging log
     return extracted_files
+def get_text_from_pdf(pdf_files):
+    """Return the text of every page of each PDF in ``pdf_files``, a newline after each page."""
+    chunks = []
+    for path in pdf_files:
+        # The reader works on the open binary handle, so pages are read inside the ``with``.
+        with open(path, "rb") as handle:
+            for page in PyPDF2.PdfReader(handle).pages:
+                chunks.append(page.extract_text() + "\n")
+    return "".join(chunks)
+def read_text_from_files(file_paths):
+    """Return each file's content (UTF-8, undecodable bytes dropped), a newline after each file."""
+    pieces = []
+    for path in file_paths:
+        with open(path, encoding="utf-8", errors="ignore") as handle:
+            pieces.append(handle.read() + "\n")
+    return "".join(pieces)
+def get_text_from_csv(csv_files):
+    """Return every CSV in ``csv_files`` rendered as text, a newline after each file.
+
+    Each file is parsed with ``pd.read_csv`` and formatted with ``DataFrame.to_string``.
+    """
+    tables = [pd.read_csv(path).to_string() + "\n" for path in csv_files]
+    return "".join(tables)
+def get_text_from_docx(docx_files):
+    """Return the text of every paragraph of each DOCX, a newline after each paragraph."""
+    lines = []
+    for path in docx_files:
+        # Only the document's ``paragraphs`` collection is read here.
+        for paragraph in Document(path).paragraphs:
+            lines.append(paragraph.text + "\n")
+    return "".join(lines)
+def get_text_from_ipynb(ipynb_files):
+    """Return the source of every markdown/code cell of each notebook, a newline after each cell."""
+    parts = []
+    for path in ipynb_files:
+        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
+            cells = json.load(handle).get("cells", [])
+        for cell in (c for c in cells if c.get("cell_type") in ("markdown", "code")):
+            src = cell.get("source", [])  # nbformat: one string, or a list of lines that keep "\n"
+            parts.append((src if isinstance(src, str) else "".join(src)) + "\n")
+    return "".join(parts)
 def combine_text_from_files(extracted_files):
     """Combines text from all extracted files."""
     text = (