File size: 5,491 Bytes
a1ad5de
ce0c8db
 
 
 
 
bd0a951
 
 
 
 
a1ad5de
 
 
 
 
bd0a951
ce0c8db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce32a21
 
bd0a951
ce0c8db
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import numpy as np
import pandas as pd

from languages import en, ru


# Column names used by the incoming results before they are renamed for
# display by process_keys (presumably the keys produced by the upstream
# classifier -- confirm against the caller).
category_init = "label"
certainty_init = "score"


def filter_results(results: pd.DataFrame, top_k=0.95):
    """Keep the leading rows that together reach ``top_k`` cumulative certainty.

    Rows are taken from the top of ``results`` until the running sum of the
    certainty column reaches ``top_k``; the row that crosses the threshold is
    included. Rows whose individual certainty is below 0.01 are then dropped.
    The caller is expected to pass rows already ordered by descending
    certainty (``process_results`` sorts before calling this).

    Args:
        results: Frame containing a ``certainty_init`` column of numeric
            certainties.
        top_k: Cumulative certainty at which to stop adding rows.

    Returns:
        The filtered frame. If the cumulative certainty never reaches
        ``top_k`` every row is a candidate; an empty frame yields an empty
        frame.
    """
    cumulative = results[certainty_init].to_numpy().cumsum()
    below_threshold = cumulative < top_k
    if below_threshold.all():
        # The threshold is never reached (or there are no rows). np.argmin
        # would return 0 for an all-True mask and raise on an empty one, so
        # handle this case explicitly and keep every row.
        keep = len(results)
    else:
        # Index of the first row at/over the threshold, inclusive.
        keep = int(np.argmin(below_threshold)) + 1
    results = results.iloc[:keep]
    return results[results[certainty_init] >= 0.01]


def process_keys(results: pd.DataFrame, lang):
    """Rename the category and certainty columns to localized headers.

    Args:
        results: Frame whose columns are named by ``category_init`` and
            ``certainty_init``.
        lang: Language key (``en`` or ``ru``); any other value falls back to
            the English headers.

    Returns:
        A renamed copy of ``results`` as produced by ``DataFrame.rename``.
    """
    headers = {
        category_init: {en: "Category", ru: "Категория"},
        certainty_init: {en: "Certainty", ru: "Уверенность"},
    }
    renames = {
        column: translations.get(lang, translations[en])
        for column, translations in headers.items()
    }
    return results.rename(columns=renames)


def process_categories(results, lang):
    """Replace category codes with human-readable, localized names.

    Each value in the ``category_init`` column is cut at its first ``"."``
    (so ``"cs.LG"`` becomes ``"cs"``) and then looked up in the table for
    ``lang``. The codes look like arXiv archive identifiers. A code missing
    from the table, or a ``lang`` with no table, leaves the truncated code
    unchanged.

    Args:
        results: Frame with a ``category_init`` column of string codes.
        lang: Language key (``en`` or ``ru``).

    Returns:
        The same ``results`` object; its category column is overwritten in
        place.
    """
    categories = {
        en: {
            "math": "Math",
            "astro-ph": "Astrophysics",
            "cond-mat": "Condensed matter physics",
            "hep-ph": "High energy physics -- Phenomenology",
            "physics": "Physics",
            "hep-th": "High energy physics -- Theory",
            "cs": "Computer Science",
            "quant-ph": "Quantum physics",
            "gr-qc": "General Relativity and Quantum Cosmology",
            "math-ph": "Mathematical Physics",
            "nucl-th": "Nuclear Theory",
            "eess": "Electrical Engineering and Systems Science",
            "q-bio": "Quantitative Biology",
            "nlin": "Nonlinear Sciences",
            "stat": "Statistics",
            "hep-lat": "High Energy Physics - Lattice",
            "hep-ex": "High Energy Physics - Experiment",
            "nucl-ex": "Nuclear Experiment",
            # Fixed user-facing typo (was "Economins").
            "econ": "Economics",
            "q-alg": "Quantum Algebra",
            "q-fin": "Quantitative Finance",
            "alg-geom": "Algebraic Geometry",
            "supr-con": "Superconductivity",
            "chao-dyn": "Chaotic dynamics",
            "dg-ga": "Differential Geometry",
            "funct-an": "Functional analysis",
            "atom-ph": "Atomic physics",
            "chem-ph": "Chemical Physics",
            "ao-sci": "Atmospheric and Oceanic Physics",
            "acc-phys": "Accelerator Physics",
            "bayes-an": "Bayesian statistics",
            "plasm-ph": "Plasma Physics",
        },
        ru: {
            "math": "Математика",
            "astro-ph": "Астрофизика",
            "cond-mat": "Физика конденсированного состояния",
            "hep-ph": "Физика элементарных частиц -- Феноменология",
            "physics": "Физика",
            "hep-th": "Физика элементарных частиц -- Теория",
            "cs": "Компьютерные науки",
            "quant-ph": "Квантовая физика",
            "gr-qc": "Общая теория относительности и квантовая космология",
            "math-ph": "Математическая физика",
            "nucl-th": "Ядерная физика",
            "eess": "Электротехника и системоведение",
            "q-bio": "Количественная биология",
            "nlin": "Нелинейные науки",
            "stat": "Статистика",
            "hep-lat": "Физика элементарных частиц -- Решетки",
            "hep-ex": "Экспериментальная физика элементарных частиц",
            "nucl-ex": "Ядерный эксперимент",
            "econ": "Экономика",
            "q-alg": "Квантовая алгебра",
            "q-fin": "Количественные финансы",
            "alg-geom": "Алгебраическая геометрия",
            "supr-con": "Сверхпроводимость",
            "chao-dyn": "Теория хаоса",
            "dg-ga": "Дифференциальная геометрия",
            "funct-an": "Функциональный анализ",
            "atom-ph": "Атомная физика",
            "chem-ph": "Химическая физика",
            "ao-sci": "Физика атмосферы и океана",
            "acc-phys": "Физика ускорителей",
            "bayes-an": "Байесовская статистика",
            "plasm-ph": "Физика плазмы",
        },
    }
    # Resolve the language table once instead of on every row.
    names = categories.get(lang, {})

    def process_category(category):
        # Keep only the part before the first "." (the whole string if none).
        prefix = category.partition(".")[0]
        return names.get(prefix, prefix)

    results[category_init] = results[category_init].apply(process_category)
    return results


def process_certainities(results):
    """Format the certainty column as percentage strings with two decimals.

    Each value in the ``certainty_init`` column is multiplied by 100 and
    rendered like ``"12.34%"``. The column is overwritten on the frame that
    was passed in, and that same frame is returned.
    """
    template = "{0:0.2f}%"

    def as_percent(certainty):
        return template.format(100 * certainty)

    formatted = results[certainty_init].apply(as_percent)
    results[certainty_init] = formatted
    return results


def process_results(results, lang):
    """Turn raw classification output into a localized display table.

    Steps, in order: build a DataFrame from ``results``, map category codes
    to readable names, sum the rows sharing a category, sort by descending
    certainty, trim with ``filter_results``, format certainties as
    percentages, and rename the columns for ``lang``.

    Args:
        results: Anything ``pd.DataFrame`` accepts that yields the
            ``category_init`` and ``certainty_init`` columns.
        lang: Language key forwarded to the category and header lookups.

    Returns:
        The processed DataFrame.
    """
    frame = process_categories(pd.DataFrame(results), lang)
    # Several raw codes can map to the same readable name; merge them.
    merged = frame.groupby(by=category_init, as_index=False).sum()
    ranked = merged.sort_values(by=[certainty_init], ascending=False)
    shortlisted = filter_results(ranked)
    return process_keys(process_certainities(shortlisted), lang)