Tesneem committed
Commit 5b0ad58 · verified · 1 Parent(s): 2cdf48b

Update document_chunker.py

Files changed (1)
  1. document_chunker.py +232 -50
document_chunker.py CHANGED
@@ -3,12 +3,10 @@ from typing import List, Dict, Optional
 from pathlib import Path
 from collections import defaultdict
 from dataclasses import dataclass
-
+import fitz # PyMuPDF
 from docx import Document
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import TfidfVectorizer
-import fitz # PyMuPDF
-
 
 @dataclass
 class DocumentChunk:
@@ -17,7 +15,6 @@ class DocumentChunk:
     embedding: List[float]
     metadata: Dict
 
-
 class DocumentChunker:
     def __init__(self):
         self.embed_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -42,21 +39,25 @@ class DocumentChunker:
 
         self.patterns = {
             'grant_application': {
-                'header_patterns': [
-                    r'\*\*([^*]+)\*\*',
-                    r'^([A-Z][^a-z]*[A-Z])$',
-                    r'^([A-Z][A-Za-z\s]+)$',
-                ],
-                'question_patterns': [
-                    r'^.+\?$',
-                    r'^\*?Please .+',
-                    r'^How .+',
-                    r'^What .+',
-                    r'^Describe .+',
-                ]
+                'header_patterns': [r'\*\*([^*]+)\*\*', r'^([A-Z][^a-z]*[A-Z])$', r'^([A-Z][A-Za-z\s]+)$'],
+                'question_patterns': [r'^.+\?$', r'^\*?Please .+', r'^How .+', r'^What .+', r'^Describe .+']
             }
         }
 
+    def match_category(self, text: str, return_first: bool = True) -> Optional[str] or List[str]:
+        lower_text = text.lower()
+        match_scores = defaultdict(int)
+        for category, patterns in self.category_patterns.items():
+            for pattern in patterns:
+                matches = re.findall(pattern, lower_text)
+                match_scores[category] += len(matches)
+
+        if not match_scores:
+            return None if return_first else []
+
+        sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
+        return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
+
     def extract_text(self, file_path: str) -> str:
         if file_path.endswith(".docx"):
            doc = Document(file_path)
@@ -65,12 +66,10 @@
             text = ""
             with fitz.open(file_path) as doc:
                 for page in doc:
-                    text += page.get_text()
+                    text += page.get_text("text") # More accurate reading order
             return text
-        elif file_path.endswith(".txt"):
-            return Path(file_path).read_text()
         else:
-            raise ValueError("Unsupported file format")
+            return Path(file_path).read_text()
 
     def detect_document_type(self, text: str) -> str:
         keywords = ['grant', 'funding', 'mission']
@@ -88,23 +87,27 @@
                 headers.append({'text': line, 'line_number': i, 'pattern_type': 'header'})
         return headers
 
+    def fallback_chunking(self, text: str, max_words=150, stride=100) -> List[Dict]:
+        words = text.split()
+        chunks = []
+        for i in range(0, len(words), stride):
+            chunk_text = ' '.join(words[i:i + max_words])
+            if len(chunk_text.split()) < 20:
+                continue
+            chunks.append({
+                'chunk_id': len(chunks) + 1,
+                'header': '',
+                'questions': [],
+                'content': chunk_text,
+                'pattern_type': 'fallback',
+                'split_index': i // stride
+            })
+        return chunks
+
     def chunk_by_headers(self, text: str, headers: List[Dict], max_words=150) -> List[Dict]:
         lines = text.split('\n')
         chunks = []
 
-        if not headers:
-            words = text.split()
-            for i in range(0, len(words), max_words):
-                piece = ' '.join(words[i:i + max_words])
-                chunks.append({
-                    'chunk_id': len(chunks) + 1,
-                    'header': '',
-                    'questions': [],
-                    'content': piece,
-                    'pattern_type': 'auto'
-                })
-            return chunks
-
         for i, header in enumerate(headers):
             start, end = header['line_number'], headers[i + 1]['line_number'] if i + 1 < len(headers) else len(lines)
             content_lines = lines[start + 1:end]
@@ -113,6 +116,8 @@
 
             for j in range(0, len(content.split()), max_words):
                 chunk_text = ' '.join(content.split()[j:j + max_words])
+                if len(chunk_text.split()) < 20:
+                    continue
                 chunks.append({
                     'chunk_id': len(chunks) + 1,
                     'header': header['text'] if header['pattern_type'] == 'header' else '',
@@ -121,24 +126,11 @@
                     'pattern_type': header['pattern_type'],
                     'split_index': j // max_words
                 })
-        return chunks
-
-    def match_category(self, text: str, return_first: bool = True) -> Optional[str] or List[str]:
-        lower_text = text.lower()
-        match_scores = defaultdict(int)
-        for category, patterns in self.category_patterns.items():
-            for pattern in patterns:
-                matches = re.findall(pattern, lower_text)
-                match_scores[category] += len(matches)
 
-        if not match_scores:
-            return None if return_first else []
-
-        sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
-        return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
+        return chunks
 
     def extract_topics_tfidf(self, text: str, max_features: int = 3) -> List[str]:
-        clean = re.sub(r'[^\w\s]', ' ', text.lower())
+        clean = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
         vectorizer = TfidfVectorizer(max_features=max_features * 2, stop_words='english')
         tfidf = vectorizer.fit_transform([clean])
         terms = vectorizer.get_feature_names_out()
@@ -158,10 +150,12 @@
         text = self.extract_text(str(file_path))
         doc_type = self.detect_document_type(text)
         headers = self.extract_headers(text, doc_type)
-        raw_chunks = self.chunk_by_headers(text, headers)
+        chunks = self.chunk_by_headers(text, headers)
+        if not chunks:
+            chunks = self.fallback_chunking(text)
 
         final_chunks = []
-        for chunk in raw_chunks:
+        for chunk in chunks:
             full_text = f"{chunk['header']} {' '.join(chunk['questions'])} {chunk['content']}".strip()
             category = self.match_category(full_text, return_first=True)
             categories = self.match_category(full_text, return_first=False)
@@ -179,8 +173,196 @@
                     "category": category,
                     "categories": categories,
                     "topics": topics,
+                    "chunking_strategy": chunk['pattern_type'],
                     "confidence_score": confidence
                 }
             })
 
         return final_chunks
+
+# import re
+# from typing import List, Dict, Optional
+# from pathlib import Path
+# from collections import defaultdict
+# from dataclasses import dataclass
+
+# from docx import Document
+# from sentence_transformers import SentenceTransformer
+# from sklearn.feature_extraction.text import TfidfVectorizer
+# import fitz # PyMuPDF
+
+
+# @dataclass
+# class DocumentChunk:
+#     chunk_id: int
+#     text: str
+#     embedding: List[float]
+#     metadata: Dict
+
+
+# class DocumentChunker:
+#     def __init__(self):
+#         self.embed_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+#         self.category_patterns = {
+#             "Project Summary": [r"\bsummary\b", r"\bproject overview\b"],
+#             "Contact Information": [r"\bcontact\b", r"\bemail\b", r"\bphone\b", r"\baddress\b"],
+#             "Problem/ Need": [r"\bproblem\b", r"\bneed\b", r"\bchallenge\b"],
+#             "Mission Statement": [r"\bmission\b", r"\bvision\b"],
+#             "Fit or Alignment to Grant": [r"\balignment\b", r"\bfit\b", r"\bgrant (focus|priority)\b"],
+#             "Goals/ Vision / Objectives": [r"\bgoals?\b", r"\bobjectives?\b", r"\bvision\b"],
+#             "Our Solution *PROGRAMS* and Approach": [r"\bsolution\b", r"\bprogram\b", r"\bapproach\b"],
+#             "Impact, Results, or Outcomes": [r"\bimpact\b", r"\bresults?\b", r"\boutcomes?\b"],
+#             "Beneficiaries": [r"\bbeneficiaries\b", r"\bwho we serve\b", r"\btarget audience\b"],
+#             "Differentiation with Competitors": [r"\bcompetitor\b", r"\bdifferent\b", r"\bvalue proposition\b"],
+#             "Plan and Timeline": [r"\btimeline\b", r"\bschedule\b", r"\bmilestone\b"],
+#             "Budget and Funding": [r"\bbudget\b", r"\bfunding\b", r"\bcost\b"],
+#             "Sustainability and Strategy": [r"\bsustainability\b", r"\bexit strategy\b"],
+#             "Organization's History": [r"\bhistory\b", r"\borganization background\b"],
+#             "Team Member Descriptions": [r"\bteam\b", r"\bstaff\b", r"\blived experience\b"],
+#         }
+
+#         self.patterns = {
+#             'grant_application': {
+#                 'header_patterns': [
+#                     r'\*\*([^*]+)\*\*',
+#                     r'^([A-Z][^a-z]*[A-Z])$',
+#                     r'^([A-Z][A-Za-z\s]+)$',
+#                 ],
+#                 'question_patterns': [
+#                     r'^.+\?$',
+#                     r'^\*?Please .+',
+#                     r'^How .+',
+#                     r'^What .+',
+#                     r'^Describe .+',
+#                 ]
+#             }
+#         }
+
+#     def extract_text(self, file_path: str) -> str:
+#         if file_path.endswith(".docx"):
+#             doc = Document(file_path)
+#             return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
+#         elif file_path.endswith(".pdf"):
+#             text = ""
+#             with fitz.open(file_path) as doc:
+#                 for page in doc:
+#                     text += page.get_text()
+#             return text
+#         elif file_path.endswith(".txt"):
+#             return Path(file_path).read_text()
+#         else:
+#             raise ValueError("Unsupported file format")
+
+#     def detect_document_type(self, text: str) -> str:
+#         keywords = ['grant', 'funding', 'mission']
+#         return 'grant_application' if sum(k in text.lower() for k in keywords) >= 2 else 'generic'
+
+#     def extract_headers(self, text: str, doc_type: str) -> List[Dict]:
+#         lines = text.split('\n')
+#         headers = []
+#         patterns = self.patterns.get(doc_type, self.patterns['grant_application'])
+#         for i, line in enumerate(lines):
+#             line = line.strip("* ")
+#             if any(re.match(p, line, re.IGNORECASE) for p in patterns['question_patterns']):
+#                 headers.append({'text': line, 'line_number': i, 'pattern_type': 'question'})
+#             elif any(re.match(p, line) for p in patterns['header_patterns']):
+#                 headers.append({'text': line, 'line_number': i, 'pattern_type': 'header'})
+#         return headers
+
+#     def chunk_by_headers(self, text: str, headers: List[Dict], max_words=150) -> List[Dict]:
+#         lines = text.split('\n')
+#         chunks = []
+
+#         if not headers:
+#             words = text.split()
+#             for i in range(0, len(words), max_words):
+#                 piece = ' '.join(words[i:i + max_words])
+#                 chunks.append({
+#                     'chunk_id': len(chunks) + 1,
+#                     'header': '',
+#                     'questions': [],
+#                     'content': piece,
+#                     'pattern_type': 'auto'
+#                 })
+#             return chunks
+
+#         for i, header in enumerate(headers):
+#             start, end = header['line_number'], headers[i + 1]['line_number'] if i + 1 < len(headers) else len(lines)
+#             content_lines = lines[start + 1:end]
+#             questions = [l.strip() for l in content_lines if l.strip().endswith('?') and len(l.split()) <= 20]
+#             content = ' '.join([l.strip() for l in content_lines if l.strip() and l.strip() not in questions])
+
+#             for j in range(0, len(content.split()), max_words):
+#                 chunk_text = ' '.join(content.split()[j:j + max_words])
+#                 chunks.append({
+#                     'chunk_id': len(chunks) + 1,
+#                     'header': header['text'] if header['pattern_type'] == 'header' else '',
+#                     'questions': questions if header['pattern_type'] == 'question' else [],
+#                     'content': chunk_text,
+#                     'pattern_type': header['pattern_type'],
+#                     'split_index': j // max_words
+#                 })
+#         return chunks
+
+#     def match_category(self, text: str, return_first: bool = True) -> Optional[str] or List[str]:
+#         lower_text = text.lower()
+#         match_scores = defaultdict(int)
+#         for category, patterns in self.category_patterns.items():
+#             for pattern in patterns:
+#                 matches = re.findall(pattern, lower_text)
+#                 match_scores[category] += len(matches)
+
+#         if not match_scores:
+#             return None if return_first else []
+
+#         sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
+#         return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
+
+#     def extract_topics_tfidf(self, text: str, max_features: int = 3) -> List[str]:
+#         clean = re.sub(r'[^\w\s]', ' ', text.lower())
+#         vectorizer = TfidfVectorizer(max_features=max_features * 2, stop_words='english')
+#         tfidf = vectorizer.fit_transform([clean])
+#         terms = vectorizer.get_feature_names_out()
+#         scores = tfidf.toarray()[0]
+#         top_terms = [term for term, score in sorted(zip(terms, scores), key=lambda x: -x[1]) if score > 0]
+#         return top_terms[:max_features]
+
+#     def calculate_confidence_score(self, chunk: Dict) -> float:
+#         score = 0.0
+#         if chunk.get('header'): score += 0.3
+#         if chunk.get('content') and len(chunk['content'].split()) > 20: score += 0.3
+#         if chunk.get('questions'): score += 0.2
+#         return min(score, 1.0)
+
+#     def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
+#         file_path = Path(file_path)
+#         text = self.extract_text(str(file_path))
+#         doc_type = self.detect_document_type(text)
+#         headers = self.extract_headers(text, doc_type)
+#         raw_chunks = self.chunk_by_headers(text, headers)
+
+#         final_chunks = []
+#         for chunk in raw_chunks:
+#             full_text = f"{chunk['header']} {' '.join(chunk['questions'])} {chunk['content']}".strip()
+#             category = self.match_category(full_text, return_first=True)
+#             categories = self.match_category(full_text, return_first=False)
+#             embedding = self.embed_model.encode(full_text).tolist()
+#             topics = self.extract_topics_tfidf(full_text)
+#             confidence = self.calculate_confidence_score(chunk)
+
+#             final_chunks.append({
+#                 "chunk_id": chunk['chunk_id'],
+#                 "text": full_text,
+#                 "embedding": embedding,
+#                 "metadata": {
+#                     **chunk,
+#                     "title": title or file_path.name,
+#                     "category": category,
+#                     "categories": categories,
+#                     "topics": topics,
+#                     "confidence_score": confidence
+#                 }
+#             })
+
+#         return final_chunks
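
For reference, a minimal usage sketch of the updated chunker (assuming the file is importable as the module document_chunker; the input file name and title below are illustrative, not part of the commit):

    from document_chunker import DocumentChunker

    chunker = DocumentChunker()
    # process_document extracts text, detects the document type, chunks by headers,
    # and now falls back to fallback_chunking when no header-based chunks are found.
    # Each returned dict carries "chunk_id", "text", "embedding", and "metadata".
    chunks = chunker.process_document("sample_grant.docx", title="Sample Grant")
    for c in chunks:
        print(c["chunk_id"], c["metadata"]["category"], c["metadata"]["chunking_strategy"])

With the defaults max_words=150 and stride=100, consecutive fallback windows overlap by 50 words, and any window (or header section) shorter than 20 words is skipped rather than emitted as a chunk.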