Spaces:
Sleeping
Sleeping
File size: 1,896 Bytes
055a9c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# =============================================================================
# core/preprocess.py
# =============================================================================
import re
import unicodedata
from typing import Any, Dict, List, Optional

from config import MambaConfig
class TextPreprocessor:
    """Text cleaning and chunking helpers driven by a ``MambaConfig``.

    The only configuration value read is ``config.max_seq_len``, which is
    stored as ``max_length`` and used to derive the default chunk size in
    :meth:`chunk_text`.
    """

    # C0 control characters (and DEL) that are NOT matched by ``\s``.
    # Whitespace-like controls (tab, newline, VT, FF, CR, FS..US) are left for
    # the whitespace pass so they keep acting as word separators.
    _CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0E-\x1B\x7F]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, config: "MambaConfig"):
        """Store *config* and cache its ``max_seq_len`` as ``max_length``."""
        self.config = config
        self.max_length = config.max_seq_len

    def clean_text(self, text: str) -> str:
        """Return *text* normalized to a single-spaced, trimmed string.

        Steps, in order:

        1. Unicode NFKC normalization.
        2. Removal of non-whitespace control characters.
        3. Collapse of every whitespace run (including newlines and tabs)
           into one space.
        4. Strip of leading/trailing whitespace.

        Control characters are removed *before* whitespace is collapsed so
        that a control character sitting between two spaces does not leave a
        double space behind.
        """
        text = unicodedata.normalize('NFKC', text)
        text = self._CONTROL_CHARS_RE.sub('', text)
        text = self._WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def chunk_text(self, text: str, chunk_size: Optional[int] = None) -> List[str]:
        """Split *text* into space-joined chunks of whole words.

        Args:
            text: Input string; it is split on whitespace into words.
            chunk_size: Maximum length, in characters, of each joined chunk.
                Defaults to ``self.max_length // 2``.
                NOTE(review): ``max_length`` comes from ``max_seq_len``,
                which presumably counts tokens, while this limit counts
                characters - confirm the intended unit.

        Returns:
            The chunks in order. Words are never split, so a single word
            longer than ``chunk_size`` is emitted as its own oversized chunk.
            Empty or whitespace-only input yields an empty list.
        """
        if chunk_size is None:
            chunk_size = self.max_length // 2

        chunks: List[str] = []
        current: List[str] = []
        current_length = 0  # always equals len(' '.join(current))

        for word in text.split():
            # A separator space is only needed when the chunk already holds a
            # word; counting it unconditionally made the first chunk one
            # character too short.
            candidate = current_length + len(word) + (1 if current else 0)
            if current and candidate > chunk_size:
                chunks.append(' '.join(current))
                current = [word]
                current_length = len(word)
            else:
                current.append(word)
                current_length = candidate

        if current:
            chunks.append(' '.join(current))
        return chunks

    def preprocess_batch(self, texts: List[str]) -> List[str]:
        """Apply :meth:`clean_text` to every string in *texts*, in order."""
        return [self.clean_text(text) for text in texts]
|