Rafa1986 committed on
Commit
a6d5350
·
verified ·
1 Parent(s): 0c80efa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -0
app.py CHANGED
@@ -50,6 +50,52 @@ def extract_files_from_folder(folder_path):
50
  print("Files found:", extracted_files) # Debugging log
51
  return extracted_files
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def combine_text_from_files(extracted_files):
54
  """Combines text from all extracted files."""
55
  text = (
 
50
  print("Files found:", extracted_files) # Debugging log
51
  return extracted_files
52
 
53
def get_text_from_pdf(pdf_files):
    """Extract the text of every page of the given PDF files.

    Args:
        pdf_files: Iterable of paths to PDF files.

    Returns:
        The extracted text of all pages, in order, each page followed by
        a newline. An empty string when ``pdf_files`` is empty.
    """
    page_texts = []
    for pdf_path in pdf_files:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                # Fall back to "" when the extractor yields nothing, so a
                # page without a text layer cannot raise TypeError on "+".
                page_texts.append((page.extract_text() or "") + "\n")
    # Join once instead of growing a string with += for every page.
    return "".join(page_texts)
62
+
63
def read_text_from_files(file_paths):
    """Read and concatenate the contents of plain-text files.

    Files are decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors="ignore"``) rather than raising.

    Args:
        file_paths: Iterable of paths to text files.

    Returns:
        The contents of all files, in order, each followed by a newline.
        An empty string when ``file_paths`` is empty.
    """
    contents = []
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            contents.append(file.read() + "\n")
    # Join once instead of growing a string with += for every file.
    return "".join(contents)
70
+
71
def get_text_from_csv(csv_files):
    """Render the contents of CSV files as plain text.

    Each file is loaded with ``pd.read_csv`` and rendered with
    ``DataFrame.to_string``.

    Args:
        csv_files: Iterable of paths to CSV files.

    Returns:
        The rendered tables, in order, each followed by a newline.
        An empty string when ``csv_files`` is empty.
    """
    tables = []
    for csv_path in csv_files:
        df = pd.read_csv(csv_path)
        tables.append(df.to_string() + "\n")
    # Join once instead of growing a string with += for every file.
    return "".join(tables)
78
+
79
def get_text_from_docx(docx_files):
    """Extract the paragraph text of the given DOCX files.

    Only the entries of ``doc.paragraphs`` are read.

    Args:
        docx_files: Iterable of paths to DOCX files.

    Returns:
        The text of every paragraph, in order, each followed by a
        newline. An empty string when ``docx_files`` is empty.
    """
    paragraphs = []
    for docx_path in docx_files:
        doc = Document(docx_path)
        paragraphs.extend(para.text + "\n" for para in doc.paragraphs)
    # Join once instead of growing a string with += for every paragraph.
    return "".join(paragraphs)
87
+
88
def get_text_from_ipynb(ipynb_files):
    """Extract the source of markdown and code cells from notebooks.

    Args:
        ipynb_files: Iterable of paths to Jupyter Notebook (.ipynb) files.

    Returns:
        The source of every markdown and code cell, in order, each cell
        followed by a newline. Cells of any other type are skipped.
        An empty string when ``ipynb_files`` is empty.
    """
    cell_texts = []
    for ipynb_path in ipynb_files:
        with open(ipynb_path, "r", encoding="utf-8", errors="ignore") as file:
            content = json.load(file)
        for cell in content.get("cells", []):
            if cell.get("cell_type") not in ("markdown", "code"):
                continue
            source = cell.get("source", [])
            if isinstance(source, str):
                # A cell source may be one string; joining it would put a
                # newline between every character, so use it unchanged.
                cell_text = source
            else:
                # List sources keep the existing newline-joined rendering.
                cell_text = "\n".join(source)
            cell_texts.append(cell_text + "\n")
    # Join once instead of growing a string with += for every cell.
    return "".join(cell_texts)
98
+
99
  def combine_text_from_files(extracted_files):
100
  """Combines text from all extracted files."""
101
  text = (