import gradio as gr import requests from datetime import datetime, timezone from concurrent.futures import ThreadPoolExecutor, as_completed API_URL = "https://huggingface.co/api/daily_papers" REPOS_API_URL_TEMPLATE = "https://huggingface.co/api/arxiv/{arxiv_id}/repos" class PaperManager: def __init__(self, papers_per_page=30): self.papers_per_page = papers_per_page self.current_page = 1 self.papers = [] self.total_pages = 1 self.sort_method = "hot" # Default sort method self.raw_papers = [] # To store fetched data def calculate_score(self, paper): """ Calculate the score of a paper based on upvotes and age. This mimics the "hotness" algorithm used by platforms like Hacker News. """ upvotes = paper.get('paper', {}).get('upvotes', 0) published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat()) try: published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00')) except ValueError: # If parsing fails, use current time to minimize the impact on sorting published_time = datetime.now(timezone.utc) time_diff = datetime.now(timezone.utc) - published_time time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours # Avoid division by zero and apply the hotness formula score = upvotes / ((time_diff_hours + 2) ** 1.5) return score def fetch_repos_counts(self, arxiv_id): """ Fetch the repositories (models, datasets, Spaces) associated with a given arxiv_id. Returns a dictionary with counts for each type. """ try: response = requests.get(REPOS_API_URL_TEMPLATE.format(arxiv_id=arxiv_id)) response.raise_for_status() data = response.json() counts = {'models': 0, 'datasets': 0, 'spaces': 0} for repo in data: repo_type = repo.get('type', '').lower() if repo_type == 'model': counts['models'] += 1 elif repo_type == 'dataset': counts['datasets'] += 1 elif repo_type == 'space': counts['spaces'] += 1 return counts except requests.RequestException as e: print(f"Error fetching repos for arxiv_id {arxiv_id}: {e}") return {'models': 0, 'datasets': 0, 'spaces': 0} except Exception as e: print(f"Unexpected error fetching repos for arxiv_id {arxiv_id}: {e}") return {'models': 0, 'datasets': 0, 'spaces': 0} def fetch_papers(self): try: response = requests.get(f"{API_URL}?limit=100") response.raise_for_status() data = response.json() if not data: print("No data received from API.") return False # Debug: Print keys of the first paper print("Keys in the first paper:", data[0].keys()) self.raw_papers = data # Store raw data # Fetch repos counts concurrently with ThreadPoolExecutor(max_workers=20) as executor: future_to_paper = {} for paper in self.raw_papers: arxiv_id = paper.get('paper', {}).get('arxiv_id', '') if arxiv_id: future = executor.submit(self.fetch_repos_counts, arxiv_id) future_to_paper[future] = paper else: # If no arxiv_id, set counts to zero paper['models'] = 0 paper['datasets'] = 0 paper['spaces'] = 0 for future in as_completed(future_to_paper): paper = future_to_paper[future] counts = future.result() paper['models'] = counts['models'] paper['datasets'] = counts['datasets'] paper['spaces'] = counts['spaces'] self.sort_papers() self.total_pages = max((len(self.papers) + self.papers_per_page - 1) // self.papers_per_page, 1) self.current_page = 1 return True except requests.RequestException as e: print(f"Error fetching papers: {e}") return False except Exception as e: print(f"Unexpected error: {e}") return False def sort_papers(self): if self.sort_method == "hot": self.papers = sorted( self.raw_papers, key=lambda x: self.calculate_score(x), reverse=True ) elif self.sort_method == "new": self.papers = sorted( self.raw_papers, key=lambda x: x.get('published_at', ''), reverse=True ) elif self.sort_method == "most_models": self.papers = sorted( self.raw_papers, key=lambda x: x.get('models', 0), reverse=True ) elif self.sort_method == "most_datasets": self.papers = sorted( self.raw_papers, key=lambda x: x.get('datasets', 0), reverse=True ) elif self.sort_method == "most_spaces": self.papers = sorted( self.raw_papers, key=lambda x: x.get('spaces', 0), reverse=True ) else: # Default to hot if unknown sort method self.papers = sorted( self.raw_papers, key=lambda x: self.calculate_score(x), reverse=True ) def set_sort_method(self, method): valid_methods = ["hot", "new", "most_models", "most_datasets", "most_spaces"] if method not in valid_methods: method = "hot" print(f"Setting sort method to: {method}") self.sort_method = method self.sort_papers() self.current_page = 1 return True # Assume success def format_paper(self, paper, rank): title = paper.get('title', 'No title') paper_id = paper.get('paper', {}).get('id', '') url = f"https://huggingface.co/papers/{paper_id}" authors = ', '.join([author.get('name', '') for author in paper.get('paper', {}).get('authors', [])]) or 'Unknown' upvotes = paper.get('paper', {}).get('upvotes', 0) comments = paper.get('numComments', 0) published_time_str = paper.get('published_at', datetime.now(timezone.utc).isoformat()) try: published_time = datetime.fromisoformat(published_time_str.replace('Z', '+00:00')) except ValueError: published_time = datetime.now(timezone.utc) time_diff = datetime.now(timezone.utc) - published_time time_ago_days = time_diff.days time_ago = f"{time_ago_days} days ago" if time_ago_days > 0 else "today" models = paper.get('models', 0) datasets = paper.get('datasets', 0) spaces = paper.get('spaces', 0) return f"""
Daily Papers |