Spaces:
Sleeping
Sleeping
| import re | |
| from typing import List | |
| from models.paper import Paper | |
class TextProcessor:
    """Clean paper text and assemble it into context strings.

    The helpers here strip unusual characters from titles and abstracts,
    render an abstract as bullet points, and combine several papers into a
    single context string.
    """

    # Anything that is not a word character, whitespace, or basic punctuation.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s.,;:()\-\'"]')
    # A period only ends a sentence when followed by whitespace or end of
    # text, so decimals such as "3.5" are not cut in half.
    _SENTENCE_END = re.compile(r"\.(?:\s+|$)")
    # A bullet is closed once its accumulated text grows beyond this length.
    _MAX_BULLET_CHARS = 100
    # Horizontal rule placed before the first paper and between papers.
    _RULE = "=" * 50

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean and normalize text content.

        Every character that is not a word character, whitespace, or one of
        ``. , ; : ( ) - ' "`` is replaced by a single space, then leading and
        trailing whitespace is stripped.

        Declared as a static method so that both ``self.clean_text(x)`` and
        ``TextProcessor.clean_text(x)`` work.

        Args:
            text: Raw text. ``None`` or an empty string is accepted.

        Returns:
            The cleaned text, or ``""`` when *text* is empty or ``None``.
        """
        if not text:
            return ""
        return TextProcessor._DISALLOWED_CHARS.sub(" ", text).strip()

    def format_paper(self, title: str, abstract: str) -> str:
        """Format a paper title and abstract for context.

        Args:
            title: Paper title; cleaned with :meth:`clean_text`.
            abstract: Paper abstract; cleaned with :meth:`clean_text`.

        Returns:
            A two-line string: ``Title: ...`` followed by ``Abstract: ...``.
        """
        title = self.clean_text(title)
        abstract = self.clean_text(abstract)
        return f"Title: {title}\nAbstract: {abstract}"

    def create_context(self, papers: List["Paper"]) -> str:
        """Create a context string from a list of papers.

        Each paper is rendered as a numbered record (title, authors,
        publication date, source, bulleted abstract). The result starts with
        a newline and a rule of 50 ``=`` characters, and the same rule is
        placed between consecutive papers.

        Args:
            papers: Objects exposing ``title``, ``authors``,
                ``publication_date``, ``source`` and ``abstract`` attributes.

        Returns:
            The combined context string. For an empty list this is just the
            leading newline and rule.
        """
        context_parts = [
            f"\nResearch Paper {number}:\n"
            f"Title: {self.clean_text(paper.title)}\n"
            "Key Points:\n"
            f"- Authors: {paper.authors or 'Not specified'}\n"
            f"- Publication Date: {paper.publication_date}\n"
            f"- Source: {paper.source}\n"
            "Main Findings:\n"
            f"{self.format_abstract(paper.abstract)}\n"
            for number, paper in enumerate(papers, 1)
        ]
        # Every part begins and ends with a newline, so joining on the rule
        # alone puts the rule on its own line between papers.
        return "\n" + self._RULE + self._RULE.join(context_parts)

    def format_abstract(self, abstract: str) -> str:
        """Format an abstract into bullet points for better readability.

        The abstract is cleaned, split into sentences, and consecutive
        sentences are grouped into one bullet until the bullet text exceeds
        ``_MAX_BULLET_CHARS`` characters or the sentences run out.

        Args:
            abstract: Raw abstract text. ``None`` or empty is accepted.

        Returns:
            One ``- ...`` bullet per line, or ``""`` when there are no
            sentences.
        """
        cleaned = self.clean_text(abstract)
        sentences = [
            piece.strip()
            for piece in self._SENTENCE_END.split(cleaned)
            if piece.strip()
        ]

        bullets = []
        pending = []
        last_index = len(sentences) - 1
        for index, sentence in enumerate(sentences):
            pending.append(sentence)
            # Compare by position, not by value: a repeated sentence must not
            # be mistaken for the final one.
            is_last = index == last_index
            if is_last or len(" ".join(pending)) > self._MAX_BULLET_CHARS:
                bullets.append("- " + ". ".join(pending) + ".")
                pending = []
        return "\n".join(bullets)