File size: 2,162 Bytes
3af593c
ee1b548
 
3af593c
 
 
 
ee1b548
 
3af593c
 
ee1b548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3af593c
ee1b548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3af593c
ee1b548
 
 
 
 
3af593c
ee1b548
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
from typing import List
from models.paper import Paper

class TextProcessor:
    """Clean paper text and assemble it into structured context strings."""

    # Anything that is not a word character, whitespace, or basic punctuation.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s.,;:()\-\'"]')
    # A sentence ends at one or more periods followed by whitespace or the end
    # of the text; a period inside a token (e.g. "0.95", "v2.1") is not a break.
    _SENTENCE_END = re.compile(r'\.+(?:\s+|$)')
    # Horizontal rule placed in front of every paper in the combined context.
    _RULE = "\n" + "=" * 50
    # A bullet point is closed once its joined text grows past this length.
    _BULLET_MIN_CHARS = 100

    @staticmethod
    def clean_text(text: str) -> str:
        """Replace unsupported characters with spaces and trim the result.

        Word characters, whitespace and the punctuation ``. , ; : ( ) - ' "``
        are kept; every other character becomes a single space.

        Args:
            text: Raw text. ``None`` or an empty string is treated as empty.

        Returns:
            The cleaned text with leading/trailing whitespace removed.
        """
        if not text:
            # Missing fields (e.g. a paper without an abstract) should yield
            # empty text instead of a TypeError from the regex engine.
            return ""
        return TextProcessor._DISALLOWED_CHARS.sub(' ', text).strip()

    def format_paper(self, title: str, abstract: str) -> str:
        """Format a paper's title and abstract as a two-line context snippet.

        Args:
            title: Paper title (cleaned with ``clean_text``).
            abstract: Paper abstract (cleaned with ``clean_text``).

        Returns:
            ``"Title: <title>\\nAbstract: <abstract>"``.
        """
        title = self.clean_text(title)
        abstract = self.clean_text(abstract)
        return f"Title: {title}\nAbstract: {abstract}"

    def create_context(self, papers: List["Paper"]) -> str:
        """Create one context string from a list of papers.

        Every paper is rendered as a numbered section (title, authors,
        publication date, source, bullet-pointed abstract) and each section
        is preceded by a rule of 50 ``=`` characters.

        Args:
            papers: Objects exposing ``title``, ``authors``,
                ``publication_date``, ``source`` and ``abstract``.

        Returns:
            The combined context. For an empty list only the leading rule
            is returned.
        """
        sections = [
            self._format_entry(number, paper)
            for number, paper in enumerate(papers, 1)
        ]
        # The rule must be part of the separator itself; concatenating it in
        # front of "\n".join(...) would only emit it once, before paper 1.
        return self._RULE + ("\n" + self._RULE).join(sections)

    def _format_entry(self, number: int, paper: "Paper") -> str:
        """Render a single paper as a numbered, labelled text section."""
        lines = [
            "",
            f"Research Paper {number}:",
            f"Title: {self.clean_text(paper.title)}",
            "Key Points:",
            f"- Authors: {paper.authors or 'Not specified'}",
            f"- Publication Date: {paper.publication_date}",
            f"- Source: {paper.source}",
            "",
            "Main Findings:",
            self.format_abstract(paper.abstract),
            "",
        ]
        return "\n".join(lines)

    def format_abstract(self, abstract: str) -> str:
        """Format an abstract into bullet points for better readability.

        The abstract is cleaned, split into sentences, and consecutive
        sentences are grouped into one bullet until the bullet exceeds
        100 characters; the final sentence always closes the current bullet.

        Args:
            abstract: Raw abstract text. ``None``/empty yields an empty string.

        Returns:
            Bullet points (``"- ..."``) separated by newlines.
        """
        clean_abstract = self.clean_text(abstract)

        # Split on sentence-ending periods only, so decimals stay intact.
        sentences = [
            piece.strip()
            for piece in TextProcessor._SENTENCE_END.split(clean_abstract)
            if piece.strip()
        ]

        bullet_points = []
        pending = []
        last_index = len(sentences) - 1

        for index, sentence in enumerate(sentences):
            pending.append(sentence)
            # Compare positions, not text: an earlier sentence that happens to
            # equal the final one must not close the bullet prematurely.
            is_last = index == last_index
            if is_last or len(' '.join(pending)) > self._BULLET_MIN_CHARS:
                bullet_points.append('- ' + '. '.join(pending) + '.')
                pending = []

        return '\n'.join(bullet_points)