Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -50,6 +50,52 @@ def extract_files_from_folder(folder_path):
|
|
| 50 |
print("Files found:", extracted_files) # Debugging log
|
| 51 |
return extracted_files
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
def combine_text_from_files(extracted_files):
|
| 54 |
"""Combines text from all extracted files."""
|
| 55 |
text = (
|
|
|
|
| 50 |
print("Files found:", extracted_files) # Debugging log
|
| 51 |
return extracted_files
|
| 52 |
|
| 53 |
+
def get_text_from_pdf(pdf_files):
    """Extract the text of every page of every PDF in ``pdf_files``.

    Args:
        pdf_files: Iterable of paths to PDF files.

    Returns:
        A single string holding each page's text followed by a newline,
        in file order and then page order. An empty string is returned
        when ``pdf_files`` is empty.
    """
    chunks = []
    for pdf_path in pdf_files:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                # Guard against a page yielding no text: if extract_text()
                # returns None (reported for some library versions on pages
                # without a text layer), `None + "\n"` would raise TypeError.
                chunks.append((page.extract_text() or "") + "\n")
    # Join once instead of growing a string with += inside nested loops.
    return "".join(chunks)
|
| 62 |
+
|
| 63 |
+
def read_text_from_files(file_paths):
    """Read and concatenate the contents of plain-text files.

    Args:
        file_paths: Iterable of paths to text files.

    Returns:
        A single string holding each file's content followed by a newline,
        in the order given. An empty string is returned when ``file_paths``
        is empty.
    """
    chunks = []
    for file_path in file_paths:
        # errors="ignore" drops bytes that are not valid UTF-8 rather than
        # raising, so a badly encoded file still yields its readable part.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            chunks.append(file.read() + "\n")
    # Join once instead of growing a string with += on every file.
    return "".join(chunks)
|
| 70 |
+
|
| 71 |
+
def get_text_from_csv(csv_files):
    """Render CSV files as text.

    Args:
        csv_files: Iterable of paths to CSV files.

    Returns:
        A single string holding, for each file in order, the
        ``DataFrame.to_string()`` rendering of the parsed CSV followed by a
        newline. An empty string is returned when ``csv_files`` is empty.
    """
    # Build all pieces first and join once instead of repeated += on a string.
    return "".join(
        pd.read_csv(csv_path).to_string() + "\n" for csv_path in csv_files
    )
|
| 78 |
+
|
| 79 |
+
def get_text_from_docx(docx_files):
    """Extract paragraph text from DOCX files.

    Args:
        docx_files: Iterable of paths to DOCX files.

    Returns:
        A single string holding the text of each paragraph followed by a
        newline, in file order and then paragraph order. Only
        ``doc.paragraphs`` is read. An empty string is returned when
        ``docx_files`` is empty.
    """
    chunks = []
    for docx_path in docx_files:
        doc = Document(docx_path)
        chunks.extend(para.text + "\n" for para in doc.paragraphs)
    # Join once instead of growing a string with += for every paragraph.
    return "".join(chunks)
|
| 87 |
+
|
| 88 |
+
def get_text_from_ipynb(ipynb_files):
    """Extract markdown and code cell sources from Jupyter notebooks.

    Args:
        ipynb_files: Iterable of paths to ``.ipynb`` (JSON) files.

    Returns:
        A single string holding the source of every markdown and code cell
        followed by a newline, in file order and then cell order. Cells of
        any other type are skipped. An empty string is returned when
        ``ipynb_files`` is empty.
    """
    chunks = []
    for ipynb_path in ipynb_files:
        with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
            content = json.load(file)
        for cell in content.get("cells", []):
            if cell.get("cell_type") not in ("markdown", "code"):
                continue
            source = cell.get("source", [])
            # The notebook format stores a cell's source either as one string
            # or as a list of lines that already carry their own "\n".
            # Joining with "\n" would double every line break for the list
            # form and insert a newline between every character for the
            # string form, so a list is joined with "" and a string is used
            # as-is.
            if not isinstance(source, str):
                source = "".join(source)
            chunks.append(source + "\n")
    return "".join(chunks)
|
| 98 |
+
|
| 99 |
def combine_text_from_files(extracted_files):
|
| 100 |
"""Combines text from all extracted files."""
|
| 101 |
text = (
|