Spaces:
Running
Running
Update document_chunker.py
Browse files- document_chunker.py +17 -2
document_chunker.py
CHANGED
@@ -70,9 +70,23 @@ class DocumentChunker:
|
|
70 |
sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
|
71 |
return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
|
72 |
|
73 |
-
def extract_text_from_docx(self, file_path: str) -> str:
    """Return the text of a .docx file, one paragraph per line.

    Any paragraph that contains at least one bold run is wrapped in ``**``
    markers; all other paragraphs are emitted unchanged.

    Args:
        file_path: Location of the .docx file, passed straight to ``Document``.

    Returns:
        The paragraph texts joined with newline characters.
    """
    rendered = []
    for paragraph in Document(file_path).paragraphs:
        # A single bold run is enough to mark the whole paragraph.
        has_bold_run = any(run.bold for run in paragraph.runs)
        if has_bold_run:
            rendered.append(f"**{paragraph.text}**")
        else:
            rendered.append(paragraph.text)
    return '\n'.join(rendered)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
def detect_document_type(self, text: str) -> str:
|
78 |
keywords = ['grant', 'funding', 'mission']
|
@@ -144,7 +158,8 @@ class DocumentChunker:
|
|
144 |
|
145 |
def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
|
146 |
file_path = Path(file_path)
|
147 |
-
text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
|
|
|
148 |
doc_type = self.detect_document_type(text)
|
149 |
headers = self.extract_headers(text, doc_type)
|
150 |
raw_chunks = self.chunk_by_headers(text, headers)
|
|
|
70 |
sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
|
71 |
return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
|
72 |
|
73 |
+
# def extract_text_from_docx(self, file_path: str) -> str:
|
74 |
+
# doc = Document(file_path)
|
75 |
+
# return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
|
76 |
+
def extract_text(self, file_path: str) -> str:
    """Extract the text content of a document, dispatching on its extension.

    * ``.docx`` - paragraphs are joined with newlines; a paragraph that
      contains at least one bold run is wrapped in ``**`` markers.
    * ``.pdf``  - the text of every page is concatenated in page order.
    * anything else - the file is read as plain text.

    The extension check is case-insensitive, so ``REPORT.PDF`` is handled
    like ``report.pdf``.

    Args:
        file_path: Path to the document. A ``str`` is expected, but any
            object whose ``str()`` is the path (e.g. ``pathlib.Path``) works.

    Returns:
        The extracted text as a single string.
    """
    # Normalise once: tolerates Path objects and upper-case extensions.
    path_str = str(file_path)
    lowered = path_str.lower()

    if lowered.endswith(".docx"):
        doc = Document(path_str)
        return '\n'.join(
            f"**{p.text}**" if any(r.bold for r in p.runs) else p.text
            for p in doc.paragraphs
        )

    if lowered.endswith(".pdf"):
        # Imported lazily so PyMuPDF is only required when a PDF is processed.
        import fitz  # PyMuPDF

        with fitz.open(path_str) as pdf:
            # join() consumes every page before the document is closed.
            return "".join(page.get_text() for page in pdf)

    # NOTE: read_text() without an encoding uses the platform default.
    return Path(path_str).read_text()
|
89 |
+
|
90 |
|
91 |
def detect_document_type(self, text: str) -> str:
|
92 |
keywords = ['grant', 'funding', 'mission']
|
|
|
158 |
|
159 |
def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
|
160 |
file_path = Path(file_path)
|
161 |
+
# text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
|
162 |
+
text = self.extract_text(str(file_path))
|
163 |
doc_type = self.detect_document_type(text)
|
164 |
headers = self.extract_headers(text, doc_type)
|
165 |
raw_chunks = self.chunk_by_headers(text, headers)
|