Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -50,6 +50,52 @@ def extract_files_from_folder(folder_path):
|
|
50 |
print("Files found:", extracted_files) # Debugging log
|
51 |
return extracted_files
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
def combine_text_from_files(extracted_files):
|
54 |
"""Combines text from all extracted files."""
|
55 |
text = (
|
|
|
50 |
print("Files found:", extracted_files) # Debugging log
|
51 |
return extracted_files
|
52 |
|
53 |
+
def get_text_from_pdf(pdf_files):
    """Extract the text of every page of every PDF in *pdf_files*.

    Args:
        pdf_files: Iterable of paths to PDF files. Each file is opened in
            binary mode and parsed with ``PyPDF2.PdfReader``.

    Returns:
        A single string containing the extracted text of each page, in
        order, with a newline appended after every page. Returns ``""``
        when *pdf_files* is empty.
    """
    page_texts = []
    for pdf_path in pdf_files:
        # Pages are read while the file is still open: the reader is built
        # on the open handle, so extraction stays inside the ``with``.
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Defensive: treat a missing extraction result as empty text
            # instead of failing on ``None + "\n"``.
            page_texts.extend(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
    # Join once at the end rather than growing a string with ``+=``.
    return "".join(page_texts)
|
62 |
+
|
63 |
+
def read_text_from_files(file_paths):
    """Read and concatenate the contents of plain-text files.

    Args:
        file_paths: Iterable of paths to text files. Each file is decoded
            as UTF-8; bytes that cannot be decoded are silently dropped
            (``errors="ignore"``).

    Returns:
        The contents of all files, in order, with a newline appended after
        each file's content. Returns ``""`` when *file_paths* is empty.
    """
    contents = []
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            contents.append(file.read())
    # Join once at the end rather than growing a string with ``+=``.
    return "".join(content + "\n" for content in contents)
|
70 |
+
|
71 |
+
def get_text_from_csv(csv_files):
    """Render each CSV file as text and concatenate the results.

    Args:
        csv_files: Iterable of paths to CSV files. Each is loaded with
            ``pd.read_csv`` using its default options.

    Returns:
        The ``DataFrame.to_string()`` rendering of every file, in order,
        with a newline appended after each one. Returns ``""`` when
        *csv_files* is empty.
    """
    # One join over a generator instead of repeated ``+=`` on a string.
    return "".join(
        pd.read_csv(csv_path).to_string() + "\n" for csv_path in csv_files
    )
|
78 |
+
|
79 |
+
def get_text_from_docx(docx_files):
    """Extract paragraph text from DOCX files.

    Args:
        docx_files: Iterable of paths to DOCX files. Each is opened with
            ``Document`` (presumably python-docx's ``docx.Document`` --
            confirm against the file's imports).

    Returns:
        The text of every entry in each document's ``paragraphs``, in
        order, with a newline appended after each paragraph. Only
        ``paragraphs`` is read; nothing else in the document is visited.
        Returns ``""`` when *docx_files* is empty.
    """
    paragraph_lines = []
    for docx_path in docx_files:
        doc = Document(docx_path)
        paragraph_lines.extend(para.text + "\n" for para in doc.paragraphs)
    # Join once at the end rather than growing a string with ``+=``.
    return "".join(paragraph_lines)
|
87 |
+
|
88 |
+
def get_text_from_ipynb(ipynb_files):
    """Extract markdown and code cell text from Jupyter notebooks.

    Args:
        ipynb_files: Iterable of paths to ``.ipynb`` files. Each file is
            decoded as UTF-8 (undecodable bytes are dropped) and parsed as
            JSON.

    Returns:
        The source of every ``markdown`` and ``code`` cell, in order, with
        one newline after each cell. Cells of any other type are skipped.
        Returns ``""`` when *ipynb_files* is empty.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    cell_texts = []
    for ipynb_path in ipynb_files:
        with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
            content = json.load(file)
        for cell in content.get("cells", []):
            if cell.get("cell_type") not in ("markdown", "code"):
                continue
            source = cell.get("source", [])
            if isinstance(source, str):
                # A cell source may be one string; joining it with "\n"
                # would put a newline between every character.
                body = source
            else:
                # List items may or may not carry their own trailing
                # newline; strip it so each line break appears once.
                body = "\n".join(line.rstrip("\n") for line in source)
            cell_texts.append(body + "\n")
    return "".join(cell_texts)
|
98 |
+
|
99 |
def combine_text_from_files(extracted_files):
|
100 |
"""Combines text from all extracted files."""
|
101 |
text = (
|