import re


class TextProcessor:
    """Stateless helpers for normalizing and formatting paper text."""

    # Anything that is not a word character, whitespace, or one of the
    # explicitly allowed punctuation marks is replaced by a space.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s.,;:()\-\'"]')
    _WHITESPACE_RUN = re.compile(r'\s+')
    _ELLIPSIS = "..."

    @staticmethod
    def clean_text(text: str) -> str:
        """Return *text* reduced to single-spaced, ASCII-only content.

        Characters outside word characters, whitespace and ``.,;:()-'"``
        are replaced by a space; any remaining non-ASCII characters are
        dropped; runs of whitespace are collapsed to one space and the
        result is stripped.

        Args:
            text: Raw text. Falsy values (``""``, ``None``) yield ``""``.

        Returns:
            The cleaned string, possibly empty.
        """
        if not text:
            return ""

        text = TextProcessor._DISALLOWED_CHARS.sub(' ', text)
        # Collapse before the ASCII filter so that Unicode whitespace
        # (e.g. NBSP) becomes a plain space instead of being deleted.
        text = TextProcessor._WHITESPACE_RUN.sub(' ', text)
        # ``\w`` also matches non-ASCII letters; those are removed here.
        text = text.encode('ascii', 'ignore').decode('ascii')
        # Removing non-ASCII tokens can leave adjacent spaces behind
        # ("a é b" -> "a  b"), so collapse once more after filtering.
        text = TextProcessor._WHITESPACE_RUN.sub(' ', text)
        return text.strip()

    @staticmethod
    def format_paper(title: str, abstract: str, max_length: int = 1000) -> str:
        """Format a paper's title and abstract as a small text record.

        Both fields are passed through :meth:`clean_text`. An abstract
        longer than *max_length* is shortened so that it never exceeds
        *max_length* characters: with a trailing ``"..."`` when there is
        room for it, otherwise by plain truncation.

        Args:
            title: Paper title.
            abstract: Paper abstract.
            max_length: Maximum number of characters kept from the cleaned
                abstract (ellipsis included). Values below zero are
                treated as zero.

        Returns:
            ``"Title: <title>\\nAbstract: <abstract>\\n---"``.
        """
        title = TextProcessor.clean_text(title)
        abstract = TextProcessor.clean_text(abstract)

        if len(abstract) > max_length:
            ellipsis = TextProcessor._ELLIPSIS
            if max_length >= len(ellipsis):
                abstract = abstract[:max_length - len(ellipsis)] + ellipsis
            else:
                # No room for the ellipsis: a negative slice bound would
                # keep almost the whole abstract, so hard-truncate instead.
                abstract = abstract[:max(max_length, 0)]

        return f"Title: {title}\nAbstract: {abstract}\n---"