Tesneem committed on
Commit
7e6f24f
·
verified ·
1 Parent(s): 3246e10

Update document_chunker.py

Browse files
Files changed (1) hide show
  1. document_chunker.py +17 -2
document_chunker.py CHANGED
@@ -70,9 +70,23 @@ class DocumentChunker:
70
  sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
71
  return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
72
 
73
- def extract_text_from_docx(self, file_path: str) -> str:
 
 
 
 
74
  doc = Document(file_path)
75
  return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
 
 
 
 
 
 
 
 
 
 
76
 
77
  def detect_document_type(self, text: str) -> str:
78
  keywords = ['grant', 'funding', 'mission']
@@ -144,7 +158,8 @@ class DocumentChunker:
144
 
145
  def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
146
  file_path = Path(file_path)
147
- text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
 
148
  doc_type = self.detect_document_type(text)
149
  headers = self.extract_headers(text, doc_type)
150
  raw_chunks = self.chunk_by_headers(text, headers)
 
70
  sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
71
  return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
72
 
73
+ # def extract_text_from_docx(self, file_path: str) -> str:
74
+ # doc = Document(file_path)
75
+ # return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
76
+ def extract_text(self, file_path: str) -> str:
77
+ if file_path.endswith(".docx"):
78
  doc = Document(file_path)
79
  return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
80
+ elif file_path.endswith(".pdf"):
81
+ import fitz # PyMuPDF
82
+ text = ""
83
+ with fitz.open(file_path) as doc:
84
+ for page in doc:
85
+ text += page.get_text()
86
+ return text
87
+ else:
88
+ return Path(file_path).read_text()
89
+
90
 
91
  def detect_document_type(self, text: str) -> str:
92
  keywords = ['grant', 'funding', 'mission']
 
158
 
159
  def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
160
  file_path = Path(file_path)
161
+ # text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
162
+ text = self.extract_text(str(file_path))
163
  doc_type = self.detect_document_type(text)
164
  headers = self.extract_headers(text, doc_type)
165
  raw_chunks = self.chunk_by_headers(text, headers)