File size: 908 Bytes
3af593c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import re
import unicodedata

class TextProcessor:
    """Static helpers for cleaning text and rendering a paper summary."""

    # Characters that are neither word characters, whitespace, nor the small
    # set of punctuation kept in cleaned text.
    _DISALLOWED = re.compile(r'[^\w\s.,;:()\-\'"]')
    _WHITESPACE = re.compile(r'\s+')
    _ELLIPSIS = "..."

    @staticmethod
    def clean_text(text: str) -> str:
        """Return *text* reduced to single-spaced ASCII.

        Steps, in order:

        1. NFKD-normalize and drop combining marks so accented letters keep
           their base letter ("café" -> "cafe") instead of vanishing.
        2. Replace every character outside word characters, whitespace and
           ``. , ; : ( ) - ' "`` with a space.
        3. Drop any remaining non-ASCII characters.
        4. Collapse whitespace runs to one space and strip the ends.

        Args:
            text: Raw input; any falsy value (``None``, ``""``) yields ``""``.

        Returns:
            The cleaned ASCII string, possibly empty.
        """
        if not text:
            return ""

        decomposed = unicodedata.normalize("NFKD", text)
        text = "".join(
            ch for ch in decomposed if not unicodedata.combining(ch)
        )

        text = TextProcessor._DISALLOWED.sub(" ", text)
        text = text.encode("ascii", "ignore").decode("ascii")

        # Collapse whitespace last: removing non-ASCII characters above can
        # leave adjacent spaces behind ("a 日本 b" -> "a  b").
        text = TextProcessor._WHITESPACE.sub(" ", text)

        return text.strip()

    @staticmethod
    def format_paper(title: str, abstract: str, max_length: int = 1000) -> str:
        """Format a paper's title and abstract as a small text record.

        Both fields are passed through :meth:`clean_text`. An abstract longer
        than *max_length* is cut so the result, including a trailing ``...``,
        is at most *max_length* characters. When *max_length* is too small to
        fit the ellipsis the abstract is cut hard with no ellipsis; negative
        values are treated as 0.

        Args:
            title: Paper title.
            abstract: Paper abstract.
            max_length: Maximum length of the abstract in the output.

        Returns:
            ``"Title: <title>\\nAbstract: <abstract>\\n---"``.
        """
        title = TextProcessor.clean_text(title)
        abstract = TextProcessor.clean_text(abstract)

        limit = max(max_length, 0)
        if len(abstract) > limit:
            ellipsis = TextProcessor._ELLIPSIS
            if limit >= len(ellipsis):
                abstract = abstract[:limit - len(ellipsis)] + ellipsis
            else:
                # No room for the ellipsis; a negative slice end would cut
                # from the wrong side and overshoot the limit.
                abstract = abstract[:limit]

        return f"Title: {title}\nAbstract: {abstract}\n---"