# (Page-extraction residue, not part of the program: "Spaces: Sleeping Sleeping")
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional, Any, Dict
# Revised data classes (Optional annotations and default values added)
@dataclass
class Author:
    """Author record whose fields are all optional.

    The ``@dataclass`` decorator generates the keyword-argument constructor
    (plus ``__repr__`` and ``__eq__``); every field defaults to ``None`` so
    a partially populated record can still be built.
    """

    _id: Optional[str] = None
    name: Optional[str] = None
    hidden: Optional[bool] = None
@dataclass
class Paper:
    """Paper record whose fields are all optional.

    The ``@dataclass`` decorator generates the keyword-argument constructor
    (plus ``__repr__`` and ``__eq__``); every field defaults to ``None``.
    """

    id: Optional[str] = None
    # Annotated as Optional so the annotation agrees with the None default.
    authors: Optional[List[Author]] = None
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    summary: Optional[str] = None
    upvotes: Optional[int] = None
    discussionId: Optional[str] = None
@dataclass
class SubmittedBy:
    """Submitter record whose fields are all optional.

    The ``@dataclass`` decorator generates the keyword-argument constructor
    (plus ``__repr__`` and ``__eq__``); every field defaults to ``None``.
    Field names are kept exactly as they are because they are part of the
    constructor's keyword interface.
    """

    _id: Optional[str] = None
    avatarUrl: Optional[str] = None
    fullname: Optional[str] = None
    name: Optional[str] = None
    type: Optional[str] = None
    isPro: Optional[bool] = None
    isHf: Optional[bool] = None
    isMod: Optional[bool] = None
    followerCount: Optional[int] = None
@dataclass
class Article:
    """Top-level article record whose fields are all optional.

    The ``@dataclass`` decorator generates the keyword-argument constructor
    (plus ``__repr__`` and ``__eq__``); every field defaults to ``None``.
    """

    paper: Optional[Paper] = None
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    thumbnail: Optional[str] = None
    numComments: Optional[int] = None
    submittedBy: Optional[SubmittedBy] = None
    isAuthorParticipating: Optional[bool] = None
def safe_get(data: Dict, *keys: str) -> Any:
    """Walk nested dictionaries along ``keys`` without raising.

    Args:
        data: The mapping to start from.
        *keys: Keys to follow, outermost first.

    Returns:
        The value found at the end of the path. ``None`` is returned when a
        key is missing, when a non-dict value is met before the path ends,
        or when the final value is an empty dict.
    """
    node: Any = data
    for step in keys:
        if not isinstance(node, dict):
            # Nothing can be looked up below a non-dict value.
            node = None
            break
        node = node.get(step, {})
    if node != {}:
        return node
    return None
def parse_article(data: Dict[str, Any]) -> Article:
    """Build an ``Article`` from one raw article mapping, tolerating gaps.

    Missing keys become ``None`` rather than raising. Malformed nested
    values (a non-string timestamp, a non-list ``authors`` value, a non-dict
    author entry or a non-dict ``submittedBy`` value) are skipped instead of
    raising ``AttributeError`` / ``TypeError``.

    Args:
        data: Raw article record. The keys read are "paper", "submittedBy",
            "publishedAt", "title", "thumbnail", "numComments" and
            "isAuthorParticipating".

    Returns:
        The populated ``Article``. ``paper`` is ``None`` when the "paper"
        value is absent or falsy, and ``submittedBy`` is ``None`` when the
        "submittedBy" value is absent, empty or not a dict.
    """

    def parse_datetime(dt_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO-8601 string; return ``None`` if it cannot be parsed."""
        # Non-string values would raise AttributeError on ``endswith``;
        # treat them, like empty strings, as "no timestamp".
        if not isinstance(dt_str, str) or not dt_str:
            return None
        # ``datetime.fromisoformat`` accepts a trailing "Z" only from
        # Python 3.11 on, so spell the UTC offset out explicitly.
        if dt_str.endswith('Z'):
            dt_str = dt_str[:-1] + '+00:00'
        try:
            return datetime.fromisoformat(dt_str)
        except ValueError:
            return None

    # Parse the author list, ignoring anything that is not a dict entry.
    raw_authors = safe_get(data, "paper", "authors")
    if not isinstance(raw_authors, (list, tuple)):
        raw_authors = []
    authors = [
        Author(
            _id=entry.get("_id"),
            name=entry.get("name"),
            hidden=entry.get("hidden"),
        )
        for entry in raw_authors
        if isinstance(entry, dict)
    ]

    # Parse the paper; it is only built when a truthy "paper" value exists.
    paper = None
    if safe_get(data, "paper"):
        paper = Paper(
            id=safe_get(data, "paper", "id"),
            authors=authors,
            publishedAt=parse_datetime(safe_get(data, "paper", "publishedAt")),
            title=safe_get(data, "paper", "title"),
            summary=safe_get(data, "paper", "summary"),
            upvotes=safe_get(data, "paper", "upvotes"),
            discussionId=safe_get(data, "paper", "discussionId"),
        )

    # Parse the submitter; a single guard replaces the per-field checks.
    submitted_by = None
    submitted_by_data = safe_get(data, "submittedBy")
    if isinstance(submitted_by_data, dict) and submitted_by_data:
        submitted_by = SubmittedBy(
            _id=submitted_by_data.get("_id"),
            avatarUrl=submitted_by_data.get("avatarUrl"),
            fullname=submitted_by_data.get("fullname"),
            name=submitted_by_data.get("name"),
            type=submitted_by_data.get("type"),
            isPro=submitted_by_data.get("isPro"),
            isHf=submitted_by_data.get("isHf"),
            isMod=submitted_by_data.get("isMod"),
            followerCount=submitted_by_data.get("followerCount"),
        )

    # Assemble the final object.
    return Article(
        paper=paper,
        publishedAt=parse_datetime(data.get("publishedAt")),
        title=data.get("title"),
        thumbnail=data.get("thumbnail"),
        numComments=data.get("numComments"),
        submittedBy=submitted_by,
        isAuthorParticipating=data.get("isAuthorParticipating"),
    )
# Usage example
if __name__ == "__main__":
    import json

    from rich import print

    # The raw data is assumed to be stored in article.json; each item
    # yielded by iterating the loaded value is passed to parse_article
    # (presumably a JSON array of article records - confirm).
    # JSON text is UTF-8, so do not rely on the platform default encoding.
    with open("article.json", encoding="utf-8") as f:
        raw_data = json.load(f)

    articles = [parse_article(raw_article) for raw_article in raw_data]

    # Only preview the first article when there is one; indexing an empty
    # list would raise IndexError.
    if articles:
        print(articles[0])
    print(len(articles))