ama-autism / utils /text_processor.py
wakeupmh's picture
refactor: structure
3af593c
raw
history blame
908 Bytes
import re
import unicodedata
class TextProcessor:
    """Stateless helpers for cleaning text and formatting paper summaries."""

    # Everything except word characters, whitespace and basic punctuation.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s.,;:()\-\'"]')
    _WHITESPACE_RUN = re.compile(r'\s+')
    _ELLIPSIS = "..."

    @staticmethod
    def clean_text(text: str) -> str:
        """Return *text* reduced to whitespace-normalized ASCII.

        Accented letters are folded to their base letter (``é`` -> ``e``),
        characters outside letters, digits, underscore, whitespace and
        ``. , ; : ( ) - ' "`` are replaced by a space, remaining non-ASCII
        characters are dropped, and runs of whitespace collapse to a single
        space.

        Args:
            text: Raw text; any falsy value (``""``, ``None``) yields ``""``.

        Returns:
            The cleaned string, without leading or trailing whitespace.
        """
        if not text:
            return ""

        # Decompose accented/compatibility characters and drop the combining
        # marks so the base letter survives the ASCII fold below instead of
        # the whole character being deleted.
        text = unicodedata.normalize("NFKD", text)
        text = "".join(ch for ch in text if not unicodedata.combining(ch))

        text = TextProcessor._DISALLOWED_CHARS.sub(" ", text)
        # Turn every Unicode whitespace character into a plain space *before*
        # the ASCII fold; otherwise exotic separators would be dropped and
        # the neighbouring words glued together.
        text = TextProcessor._WHITESPACE_RUN.sub(" ", text)
        text = text.encode("ascii", "ignore").decode("ascii")

        # Dropping non-ASCII characters can leave adjacent spaces behind, so
        # whitespace has to be collapsed once more after the fold.
        return TextProcessor._WHITESPACE_RUN.sub(" ", text).strip()

    @staticmethod
    def format_paper(title: str, abstract: str, max_length: int = 1000) -> str:
        """Format a paper's title and abstract as a delimited text block.

        Both fields are passed through :meth:`clean_text`. An abstract longer
        than ``max_length`` characters is truncated so that the result,
        including a trailing ``...``, is exactly ``max_length`` characters.
        When ``max_length`` is too small to hold the ellipsis, the abstract
        is hard-cut to ``max_length`` characters (nothing for values <= 0).

        Args:
            title: Paper title.
            abstract: Paper abstract.
            max_length: Maximum length of the cleaned abstract.

        Returns:
            ``"Title: <title>\\nAbstract: <abstract>\\n---"``.
        """
        title = TextProcessor.clean_text(title)
        abstract = TextProcessor.clean_text(abstract)

        if len(abstract) > max_length:
            ellipsis = TextProcessor._ELLIPSIS
            if max_length >= len(ellipsis):
                abstract = abstract[:max_length - len(ellipsis)] + ellipsis
            else:
                # No room for the ellipsis; a negative slice index here would
                # cut from the end and overshoot the limit.
                abstract = abstract[:max(max_length, 0)]

        return f"Title: {title}\nAbstract: {abstract}\n---"