"""
Utility functions for text processing and embeddings.
This module contains utility functions for text processing, tokenization,
chunking, and embedding operations.
"""
import os
from typing import List, Dict, Any, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document

from configs.config import Config


def estimate_tokens(text: str) -> int:
    """
    Estimate the number of tokens in a text (rough approximation).

    Args:
        text: Input text

    Returns:
        Estimated number of tokens
    """
    return len(text) // 4
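
# Rough worked example of the 4-characters-per-token heuristic above (illustrative only,
# not a tokenizer-accurate count):
#     estimate_tokens("a" * 100)  # -> 25  (100 characters // 4)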


def process_pdf_file(file_path: str) -> List[Document]:
    """
    Load a PDF file and extract its text with metadata.

    Args:
        file_path: Path to the PDF file

    Returns:
        List of Document objects with metadata

    Raises:
        FileNotFoundError: If the file doesn't exist
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    return documents
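
# Usage sketch (the file name is a placeholder; PyMuPDFLoader typically yields one
# Document per page, with metadata such as "source" and "page"):
#     docs = process_pdf_file("example.pdf")
#     print(len(docs), docs[0].metadata)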


def chunk_text(documents: List[Document], max_length: int = None) -> List[Dict[str, Any]]:
    """
    Split documents into paragraph-based chunks with metadata.

    Args:
        documents: List of Document objects
        max_length: Maximum chunk length in characters; the comparison below converts
            it to an estimated token budget of max_length // 4. Defaults to
            Config.DEFAULT_CHUNK_SIZE.

    Returns:
        List of chunk dictionaries with text and metadata
    """
    if max_length is None:
        max_length = Config.DEFAULT_CHUNK_SIZE

    chunks = []
    for doc in documents:
        text = doc.page_content
        metadata = doc.metadata
        paragraphs = text.split("\n\n")

        current_chunk = ""
        current_metadata = metadata.copy()
        for paragraph in paragraphs:
            # Skip very short paragraphs
            if len(paragraph.strip()) < Config.MIN_PARAGRAPH_LENGTH:
                continue
            # Keep accumulating while the estimated token count stays within the budget
            # (max_length // 4 tokens, matching the 4-chars-per-token heuristic above)
            if estimate_tokens(current_chunk + paragraph) <= max_length // 4:
                current_chunk += paragraph + "\n\n"
            else:
                # Only add chunks with meaningful content
                if current_chunk.strip() and len(current_chunk.strip()) > Config.MIN_CHUNK_LENGTH:
                    chunks.append({
                        "text": current_chunk.strip(),
                        "metadata": current_metadata
                    })
                current_chunk = paragraph + "\n\n"

        # Add the last chunk of this document if it has meaningful content
        if current_chunk.strip() and len(current_chunk.strip()) > Config.MIN_CHUNK_LENGTH:
            chunks.append({
                "text": current_chunk.strip(),
                "metadata": current_metadata
            })
    return chunks
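
# Usage sketch (values are illustrative; Config.MIN_PARAGRAPH_LENGTH and
# Config.MIN_CHUNK_LENGTH still apply to each paragraph and chunk):
#     chunks = chunk_text(docs, max_length=1000)
#     # -> [{"text": "First chunk ...", "metadata": {"source": "example.pdf", "page": 0}}, ...]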


def create_embeddings(chunks: List[Dict[str, Any]], model: SentenceTransformer) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
    """
    Create embeddings for a list of chunk texts.

    Args:
        chunks: List of chunk dictionaries
        model: SentenceTransformer model

    Returns:
        Tuple of (embeddings array, chunks)
    """
    texts = [chunk["text"] for chunk in chunks]
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_tensor=True)
    # encode() returns a torch tensor here; move it to CPU and convert to a NumPy array
    return embeddings.cpu().numpy(), chunks
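
# Usage sketch (the model name is illustrative; any SentenceTransformer checkpoint works):
#     model = SentenceTransformer("all-MiniLM-L6-v2")
#     embeddings, chunks = create_embeddings(chunks, model)
#     embeddings.shape  # (len(chunks), embedding_dim), e.g. 384 dimensions for MiniLM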


def filter_relevant_chunks(chunks_data: List[Tuple], threshold: float = None) -> List[Tuple]:
    """
    Filter chunks based on a similarity threshold.

    Args:
        chunks_data: List of (text, score, metadata) tuples
        threshold: Similarity threshold (lower is more similar)

    Returns:
        Filtered list of chunks
    """
    if threshold is None:
        threshold = Config.SIMILARITY_THRESHOLD
    return [chunk for chunk in chunks_data if len(chunk) >= 3 and chunk[1] < threshold]
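
# Example with an explicit threshold (scores are distances, so lower means more similar):
#     filter_relevant_chunks([("kept", 0.3, {}), ("dropped", 1.2, {})], threshold=0.8)
#     # -> [("kept", 0.3, {})]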


def prepare_context_from_chunks(context_chunks: List[Tuple], max_tokens: int = None) -> str:
    """
    Prepare a context string from chunk data.

    Args:
        context_chunks: List of (text, score, metadata) tuples
        max_tokens: Maximum tokens for the context

    Returns:
        Formatted context string
    """
    if max_tokens is None:
        max_tokens = Config.MAX_CONTEXT_TOKENS

    # Sort chunks by relevance (lower distance = more relevant)
    sorted_chunks = sorted(context_chunks, key=lambda x: x[1]) if context_chunks else []
    # Filter out chunks with very high distance scores (low similarity)
    relevant_chunks = filter_relevant_chunks(sorted_chunks)

    context = ""
    total_tokens = 0
    for text, _, _ in relevant_chunks:
        if text and text.strip():
            chunk_tokens = estimate_tokens(text)
            if total_tokens + chunk_tokens <= max_tokens:
                context += text + "\n\n"
                total_tokens += chunk_tokens
            else:
                break
    return context.strip() if context else "No initial context provided from preliminary search."
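
# Usage sketch (the tuple mimics a vector-store query result; max_tokens given explicitly):
#     prepare_context_from_chunks(
#         [("Paris is the capital of France.", 0.42, {"page": 3})],
#         max_tokens=500,
#     )
#     # -> "Paris is the capital of France."  (or the fallback string if nothing
#     #    passes the Config.SIMILARITY_THRESHOLD filter)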


def validate_chunk_data(chunk_data: Any) -> bool:
    """
    Validate chunk data structure.

    Args:
        chunk_data: Chunk data to validate

    Returns:
        True if valid, False otherwise
    """
    if not isinstance(chunk_data, (list, tuple)):
        return False
    if len(chunk_data) < 3:
        return False
    text, score, metadata = chunk_data[0], chunk_data[1], chunk_data[2]
    if not isinstance(text, str) or not text.strip():
        return False
    if not isinstance(score, (int, float)):
        return False
    if not isinstance(metadata, dict):
        return False
    return True
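

if __name__ == "__main__":
    # Minimal smoke test of the pure helpers (no PDF or embedding model needed beyond
    # what this module already imports); intended as a sketch, not a test suite.
    sample = ("demo text " * 10, 0.25, {"source": "inline"})
    print(estimate_tokens(sample[0]))                       # ~25 tokens for 100 characters
    print(validate_chunk_data(sample))                      # True: (str, float, dict)
    print(filter_relevant_chunks([sample], threshold=0.5))  # kept, since 0.25 < 0.5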