Spaces:

Tesneem
/

document_chunker

Running

App Files Files Community

Tesneem committed 6 days ago

Commit

7e6f24f

verified ·

1 Parent(s): 3246e10

Update document_chunker.py

Browse files

Files changed (1) hide show

document_chunker.py +17 -2

document_chunker.py CHANGED Viewed

@@ -70,9 +70,23 @@ class DocumentChunker:
         sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
         return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
-    def extract_text_from_docx(self, file_path: str) -> str:
         doc = Document(file_path)
         return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
     def detect_document_type(self, text: str) -> str:
         keywords = ['grant', 'funding', 'mission']
@@ -144,7 +158,8 @@ class DocumentChunker:
     def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
         file_path = Path(file_path)
-        text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
         doc_type = self.detect_document_type(text)
         headers = self.extract_headers(text, doc_type)
         raw_chunks = self.chunk_by_headers(text, headers)

         sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
         return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
+    # def extract_text_from_docx(self, file_path: str) -> str:
+    #     doc = Document(file_path)
+    #     return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
+    def extract_text(self, file_path: str) -> str:
+    if file_path.endswith(".docx"):
         doc = Document(file_path)
         return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
+    elif file_path.endswith(".pdf"):
+        import fitz  # PyMuPDF
+        text = ""
+        with fitz.open(file_path) as doc:
+            for page in doc:
+                text += page.get_text()
+        return text
+    else:
+        return Path(file_path).read_text()
     def detect_document_type(self, text: str) -> str:
         keywords = ['grant', 'funding', 'mission']
     def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
         file_path = Path(file_path)
+        # text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
+        text = self.extract_text(str(file_path))
         doc_type = self.detect_document_type(text)
         headers = self.extract_headers(text, doc_type)
         raw_chunks = self.chunk_by_headers(text, headers)