File size: 4,172 Bytes
c7478e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional, Any, Dict

# Revised data classes (every field is Optional and has a default value)


@dataclass
class Author:
    """One entry of a paper's author list.

    Every field defaults to ``None`` so an instance can be created from a
    record in which some keys are absent.
    """
    # Identifier of the author record (presumably assigned by the upstream
    # data source -- TODO confirm).
    _id: Optional[str] = None
    name: Optional[str] = None
    # NOTE(review): the meaning of this flag is not visible in this file.
    hidden: Optional[bool] = None


@dataclass
class Paper:
    """Paper details attached to an article.

    Every field defaults to ``None`` so an instance can be created from a
    record in which some keys are absent.
    """
    # NOTE: the field name shadows the ``id`` builtin inside the class body;
    # it is kept because it is part of the constructor interface.
    id: Optional[str] = None
    # Annotated Optional because the default is None, not an empty list.
    authors: Optional[List[Author]] = None
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    summary: Optional[str] = None
    upvotes: Optional[int] = None
    discussionId: Optional[str] = None


@dataclass
class SubmittedBy:
    """Account that submitted an article.

    Every field defaults to ``None`` so an instance can be created from a
    record in which some keys are absent.
    """
    _id: Optional[str] = None
    avatarUrl: Optional[str] = None
    fullname: Optional[str] = None
    name: Optional[str] = None
    # NOTE: the field name shadows the ``type`` builtin inside the class body;
    # it is kept because it is part of the constructor interface.
    type: Optional[str] = None
    # NOTE(review): the three flags below look like account-role markers;
    # their exact meaning is not visible in this file -- confirm upstream.
    isPro: Optional[bool] = None
    isHf: Optional[bool] = None
    isMod: Optional[bool] = None
    followerCount: Optional[int] = None


@dataclass
class Article:
    """Top-level article record, as produced by ``parse_article``.

    Every field defaults to ``None`` so an instance can be created from a
    record in which some keys are absent.
    """
    # Nested paper details; None when the record carries no paper data.
    paper: Optional[Paper] = None
    publishedAt: Optional[datetime] = None
    title: Optional[str] = None
    thumbnail: Optional[str] = None
    numComments: Optional[int] = None
    # Nested submitter details; None when the record carries no submitter.
    submittedBy: Optional[SubmittedBy] = None
    isAuthorParticipating: Optional[bool] = None


def safe_get(data: Dict, *keys: str) -> Any:
    """Walk nested dictionaries along ``keys`` without raising.

    Returns the value found at the end of the path. ``None`` is returned
    instead when a key is missing, when a non-dict value is met while keys
    remain to be resolved, or when the final value equals an empty dict.
    """
    node = data
    for name in keys:
        if not isinstance(node, dict):
            # Nothing below a non-dict value can be looked up.
            return None
        # A missing key yields {} so that the next lookup still works.
        node = node.get(name, {})

    if node != {}:
        return node
    return None


def parse_article(data: Dict[str, Any]) -> Article:
    """Build an ``Article`` from a raw dictionary, tolerating bad input.

    Missing keys and malformed nested values are turned into ``None`` (or
    skipped, for author entries) instead of raising.

    Args:
        data: Raw article mapping. The keys read are ``"paper"``,
            ``"submittedBy"``, ``"publishedAt"``, ``"title"``,
            ``"thumbnail"``, ``"numComments"`` and
            ``"isAuthorParticipating"``. A value that is not a dict is
            treated like an empty mapping.

    Returns:
        An ``Article`` whose ``paper`` is ``None`` when the record has no
        usable ``"paper"`` value and whose ``submittedBy`` is ``None`` when
        ``"submittedBy"`` is missing, empty or not a dict.
    """

    def parse_datetime(dt_str: Any) -> Optional[datetime]:
        """Parse an ISO-8601 string; return ``None`` if absent or invalid."""
        # Non-string values would raise AttributeError/TypeError below, which
        # defeats the purpose of a tolerant parser.
        if not isinstance(dt_str, str) or not dt_str:
            return None
        # datetime.fromisoformat() only accepts a trailing 'Z' from
        # Python 3.11 on, so rewrite it as an explicit UTC offset.
        if dt_str.endswith("Z"):
            dt_str = dt_str[:-1] + "+00:00"
        try:
            return datetime.fromisoformat(dt_str)
        except ValueError:
            return None

    if not isinstance(data, dict):
        data = {}

    # Parse the author list, skipping anything that is not a dict entry.
    authors_data = safe_get(data, "paper", "authors")
    if not isinstance(authors_data, (list, tuple)):
        authors_data = []
    authors = [
        Author(
            _id=entry.get("_id"),
            name=entry.get("name"),
            hidden=entry.get("hidden"),
        )
        for entry in authors_data
        if isinstance(entry, dict)
    ]

    # Parse the paper.
    paper = None
    if safe_get(data, "paper"):
        paper = Paper(
            id=safe_get(data, "paper", "id"),
            authors=authors,
            publishedAt=parse_datetime(safe_get(data, "paper", "publishedAt")),
            title=safe_get(data, "paper", "title"),
            summary=safe_get(data, "paper", "summary"),
            upvotes=safe_get(data, "paper", "upvotes"),
            discussionId=safe_get(data, "paper", "discussionId"),
        )

    # Parse the submitter. safe_get() already maps an empty dict to None, so
    # any dict that reaches this point has content.
    submitted_by = None
    submitted_by_data = safe_get(data, "submittedBy")
    if isinstance(submitted_by_data, dict):
        submitted_by = SubmittedBy(
            _id=submitted_by_data.get("_id"),
            avatarUrl=submitted_by_data.get("avatarUrl"),
            fullname=submitted_by_data.get("fullname"),
            name=submitted_by_data.get("name"),
            type=submitted_by_data.get("type"),
            isPro=submitted_by_data.get("isPro"),
            isHf=submitted_by_data.get("isHf"),
            isMod=submitted_by_data.get("isMod"),
            followerCount=submitted_by_data.get("followerCount"),
        )

    # Assemble the final object.
    return Article(
        paper=paper,
        publishedAt=parse_datetime(data.get("publishedAt")),
        title=data.get("title"),
        thumbnail=data.get("thumbnail"),
        numComments=data.get("numComments"),
        submittedBy=submitted_by,
        isAuthorParticipating=data.get("isAuthorParticipating"),
    )


# Usage example
if __name__ == "__main__":
    import json
    from rich import print

    # The raw data is assumed to be stored in article.json; the loop below
    # expects the top-level JSON value to be a sequence of article records.
    # The encoding is given explicitly so decoding does not depend on the
    # platform's default locale encoding.
    with open("article.json", encoding="utf-8") as f:
        raw_data = json.load(f)

    articles = [parse_article(raw_article) for raw_article in raw_data]

    # Only preview the first record when there is one; an empty input file
    # would otherwise raise IndexError.
    if articles:
        print(articles[0])
    print(len(articles))