Spaces:
Sleeping
Sleeping
File size: 4,172 Bytes
c7478e3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional, Any, Dict
# Revised data classes (every field is Optional and has a default value)
@dataclass
class Author:
    """One author entry of a paper.

    All fields default to ``None`` so an instance can be created from an
    incomplete record.
    """

    # Populated from the "_id" key of the raw author record.
    _id: Optional[str] = None
    # Populated from the "name" key of the raw author record.
    name: Optional[str] = None
    # Populated from the "hidden" key of the raw author record.
    hidden: Optional[bool] = None
@dataclass
class Paper:
    """Paper information nested under an article.

    All fields default to ``None`` so an instance can be created from an
    incomplete record.
    """

    id: Optional[str] = None
    # The default is None (not an empty list), so the annotation has to be
    # Optional; a bare ``List[Author]`` contradicts the default and is
    # rejected by type checkers.
    authors: Optional[List[Author]] = None
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    summary: Optional[str] = None
    upvotes: Optional[int] = None
    discussionId: Optional[str] = None
@dataclass
class SubmittedBy:
    """Account information of whoever submitted an article.

    All fields default to ``None`` so an instance can be created from an
    incomplete record. Field names mirror the keys of the raw record.
    """

    # Identity fields.
    _id: Optional[str] = None
    avatarUrl: Optional[str] = None
    fullname: Optional[str] = None
    name: Optional[str] = None
    type: Optional[str] = None

    # Boolean account flags.
    isPro: Optional[bool] = None
    isHf: Optional[bool] = None
    isMod: Optional[bool] = None

    followerCount: Optional[int] = None
@dataclass
class Article:
    """Top-level article record.

    All fields default to ``None`` so an instance can be created from an
    incomplete record.
    """

    # Nested paper information, if any.
    paper: Optional[Paper] = None

    # Article-level attributes.
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    thumbnail: Optional[str] = None
    numComments: Optional[int] = None

    # Nested submitter information, if any.
    submittedBy: Optional[SubmittedBy] = None

    isAuthorParticipating: Optional[bool] = None
def safe_get(data: Dict, *keys: str) -> Any:
    """Follow ``keys`` through nested dicts without raising on gaps.

    Returns the value reached after applying every key in order. ``None`` is
    returned instead when a key is missing, when a non-dict value is met
    while keys remain, or when the final value is an empty dict.
    """
    node = data
    for name in keys:
        if not isinstance(node, dict):
            # Nothing to descend into, so the remaining keys cannot resolve.
            return None
        # A missing key yields {}, which lets the walk continue and is
        # turned into None by the final check below.
        node = node.get(name, {})
    if node != {}:
        return node
    return None
def parse_article(data: Dict[str, Any]) -> Article:
    """Build an ``Article`` from one raw article dict, tolerating bad input.

    Keys that are missing from ``data`` leave the matching attribute as
    ``None``. Malformed nested values are tolerated as well:

    * a ``publishedAt`` value that is not a string, or that cannot be parsed
      as an ISO-8601 timestamp, becomes ``None``;
    * a ``paper`` or ``submittedBy`` value that is empty or not a dict
      becomes ``None``;
    * a ``paper.authors`` value that is not a list is treated as empty, and
      author entries that are not dicts are skipped.

    Args:
        data: Raw article mapping.

    Returns:
        The populated ``Article``.
    """

    def parse_datetime(dt_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO-8601 timestamp; return ``None`` if it is unusable."""
        # Non-string values (numbers, dicts, ...) have no ``endswith`` and
        # would raise AttributeError, so treat them as "no timestamp".
        if not isinstance(dt_str, str) or not dt_str:
            return None
        # ``datetime.fromisoformat`` only accepts a trailing "Z" from
        # Python 3.11 on, so spell UTC as an explicit offset instead.
        if dt_str.endswith("Z"):
            dt_str = dt_str[:-1] + "+00:00"
        try:
            return datetime.fromisoformat(dt_str)
        except ValueError:
            return None

    # Parse the paper (including its author list).
    paper = None
    paper_data = safe_get(data, "paper")
    # safe_get maps an empty dict to None, so a dict here is non-empty.
    if isinstance(paper_data, dict):
        raw_authors = safe_get(paper_data, "authors")
        if not isinstance(raw_authors, (list, tuple)):
            raw_authors = []
        authors = [
            Author(
                _id=entry.get("_id"),
                name=entry.get("name"),
                hidden=entry.get("hidden"),
            )
            for entry in raw_authors
            if isinstance(entry, dict)  # skip null / scalar entries
        ]
        paper = Paper(
            id=safe_get(paper_data, "id"),
            authors=authors,
            publishedAt=parse_datetime(safe_get(paper_data, "publishedAt")),
            title=safe_get(paper_data, "title"),
            summary=safe_get(paper_data, "summary"),
            upvotes=safe_get(paper_data, "upvotes"),
            discussionId=safe_get(paper_data, "discussionId"),
        )

    # Parse the submitter.
    submitted_by = None
    submitter_data = safe_get(data, "submittedBy")
    if isinstance(submitter_data, dict):
        submitted_by = SubmittedBy(
            _id=submitter_data.get("_id"),
            avatarUrl=submitter_data.get("avatarUrl"),
            fullname=submitter_data.get("fullname"),
            name=submitter_data.get("name"),
            type=submitter_data.get("type"),
            isPro=submitter_data.get("isPro"),
            isHf=submitter_data.get("isHf"),
            isMod=submitter_data.get("isMod"),
            followerCount=submitter_data.get("followerCount"),
        )

    # Assemble the final object.
    return Article(
        paper=paper,
        publishedAt=parse_datetime(data.get("publishedAt")),
        title=data.get("title"),
        thumbnail=data.get("thumbnail"),
        numComments=data.get("numComments"),
        submittedBy=submitted_by,
        isAuthorParticipating=data.get("isAuthorParticipating"),
    )
# Usage example
if __name__ == "__main__":
    import json

    # NOTE: deliberately replaces the built-in print with rich's
    # pretty-printing version for the rest of this script.
    from rich import print

    # The raw data is assumed to be stored in article.json; it is iterated
    # below, one raw article per element.
    # JSON text is decoded explicitly as UTF-8 rather than with the
    # platform-dependent default encoding.
    with open("article.json", encoding="utf-8") as f:
        raw_data = json.load(f)

    articles = [parse_article(raw_article) for raw_article in raw_data]

    # Guard against an empty payload, where articles[0] would raise
    # IndexError.
    if articles:
        print(articles[0])
    print(len(articles))
|