|
import numpy as np |
|
import pandas as pd |
|
|
|
from languages import en, ru |
|
|
|
|
|
# Names of the two columns the helpers in this module read from the incoming
# DataFrame; process_keys later renames them to localized display headers.
category_init = "label"  # column holding the category code
certainty_init = "score"  # column holding the numeric certainty
|
|
|
|
|
def filter_results(results: pd.DataFrame, top_k=0.95):
    """Keep the leading rows that together account for ``top_k`` certainty.

    Rows are consumed in their existing order (``process_results`` sorts them
    by descending certainty before calling this function). The row on which
    the running total first stops being below ``top_k`` is kept, and every
    row after it is dropped. Rows whose own certainty is below 0.01 are then
    removed as well.

    Args:
        results: Frame containing a ``certainty_init`` column of numbers.
        top_k: Cumulative certainty at which the listing is cut off.

    Returns:
        A filtered frame. If the running total never reaches ``top_k`` every
        row is retained (subject to the 0.01 filter); an empty frame yields
        an empty frame.
    """
    scores = results[certainty_init].to_numpy()
    below_threshold = scores.cumsum() < top_k

    if below_threshold.all():
        # The threshold is never reached (or there are no rows at all).
        # np.argmin would report index 0 for an all-True mask and raise on an
        # empty one, so handle this case explicitly and keep everything.
        keep = len(results)
    else:
        # Index of the first row that reaches the threshold, inclusive.
        keep = int(np.argmin(below_threshold)) + 1

    head = results.iloc[:keep]
    return head[head[certainty_init] >= 0.01]
|
|
|
|
|
def process_keys(results: pd.DataFrame, lang):
    """Rename the category and certainty columns to headers for ``lang``.

    Args:
        results: Frame whose ``category_init`` / ``certainty_init`` columns
            should be renamed.
        lang: Language key (``en`` or ``ru``); any other value falls back to
            the English headers.

    Returns:
        The frame produced by ``DataFrame.rename`` with the new column names.
    """
    headers = {
        category_init: {en: "Category", ru: "Категория"},
        certainty_init: {en: "Certainty", ru: "Уверенность"},
    }
    renamed_columns = {
        column: by_lang.get(lang, by_lang[en])
        for column, by_lang in headers.items()
    }
    return results.rename(columns=renamed_columns)
|
|
|
|
|
def process_categories(results, lang):
    """Replace category codes in ``results`` with human-readable names.

    Each value of the ``category_init`` column is cut at its first ``"."``
    (so ``"cs.LG"`` is treated as ``"cs"``) and then looked up in the name
    table for ``lang``. Codes missing from the table are left as the
    truncated code.

    Args:
        results: Frame with a ``category_init`` column of string codes.
        lang: Language key (``en`` or ``ru``). Any other value falls back to
            the English names, matching the fallback used by
            ``process_keys`` for the column headers.

    Returns:
        The same ``results`` object; its ``category_init`` column is
        overwritten in place.
    """
    categories = {
        en: {
            "math": "Math",
            "astro-ph": "Astrophysics",
            "cond-mat": "Condensed matter physics",
            "hep-ph": "High energy physics -- Phenomenology",
            "physics": "Physics",
            "hep-th": "High energy physics -- Theory",
            "cs": "Computer Science",
            "quant-ph": "Quantum physics",
            "gr-qc": "General Relativity and Quantum Cosmology",
            "math-ph": "Mathematical Physics",
            "nucl-th": "Nuclear Theory",
            "eess": "Electrical Engineering and Systems Science",
            "q-bio": "Quantitative Biology",
            "nlin": "Nonlinear Sciences",
            "stat": "Statistics",
            "hep-lat": "High Energy Physics - Lattice",
            "hep-ex": "High Energy Physics - Experiment",
            "nucl-ex": "Nuclear Experiment",
            "econ": "Economics",
            "q-alg": "Quantum Algebra",
            "q-fin": "Quantitative Finance",
            "alg-geom": "Algebraic Geometry",
            "supr-con": "Superconductivity",
            "chao-dyn": "Chaotic dynamics",
            "dg-ga": "Differential Geometry",
            "funct-an": "Functional analysis",
            "atom-ph": "Atomic physics",
            "chem-ph": "Chemical Physics",
            "ao-sci": "Atmospheric and Oceanic Physics",
            "acc-phys": "Accelerator Physics",
            "bayes-an": "Bayesian statistics",
            "plasm-ph": "Plasma Physics",
        },
        ru: {
            "math": "Математика",
            "astro-ph": "Астрофизика",
            "cond-mat": "Физика конденсированного состояния",
            "hep-ph": "Физика элементарных частиц -- Феноменология",
            "physics": "Физика",
            "hep-th": "Физика элементарных частиц -- Теория",
            "cs": "Компьютерные науки",
            "quant-ph": "Квантовая физика",
            "gr-qc": "Общая теория относительности и квантовая космология",
            "math-ph": "Математическая физика",
            "nucl-th": "Ядерная физика",
            "eess": "Электротехника и системоведение",
            "q-bio": "Количественная биология",
            "nlin": "Нелинейные науки",
            "stat": "Статистика",
            "hep-lat": "Физика элементарных частиц -- Решетки",
            "hep-ex": "Экспериментальная физика элементарных частиц",
            "nucl-ex": "Ядерный эксперимент",
            "econ": "Экономика",
            "q-alg": "Квантовая алгебра",
            "q-fin": "Количественные финансы",
            "alg-geom": "Алгебраическая геометрия",
            "supr-con": "Сверхпроводимость",
            "chao-dyn": "Теория хаоса",
            "dg-ga": "Дифференциальная геометрия",
            "funct-an": "Функциональный анализ",
            "atom-ph": "Атомная физика",
            "chem-ph": "Химическая физика",
            "ao-sci": "Физика атмосферы и океана",
            "acc-phys": "Физика ускорителей",
            "bayes-an": "Байесовская статистика",
            "plasm-ph": "Физика плазмы",
        },
    }

    # Resolve the language once; unknown languages use the English table so
    # the category names stay consistent with the English column headers.
    names = categories.get(lang, categories[en])

    def process_category(category):
        # Keep only the part before the first "." ("cs.LG" -> "cs").
        archive = category.partition(".")[0]
        # Codes without a known name are shown as the truncated code.
        return names.get(archive, archive)

    results[category_init] = results[category_init].apply(process_category)
    return results
|
|
|
|
|
def process_certainities(results):
    """Render the certainty column as percentage strings.

    Every value in the ``certainty_init`` column is multiplied by 100 and
    formatted with two decimals and a trailing ``%`` (``0.5`` becomes
    ``"50.00%"``).

    Returns:
        The same ``results`` object; the column is overwritten in place.
    """
    percent_template = "{0:0.2f}%"

    def as_percent(certainty):
        return percent_template.format(100 * certainty)

    results[certainty_init] = results[certainty_init].apply(as_percent)
    return results
|
|
|
|
|
def process_results(results, lang):
    """Build the localized, display-ready table from ``results``.

    Steps, in order:
      1. Load ``results`` into a DataFrame.
      2. Map category codes to display names (``process_categories``).
      3. Sum the rows that ended up with the same category name, since
         sub-category codes are collapsed onto their prefix in step 2.
      4. Sort by certainty, highest first.
      5. Trim the listing (``filter_results`` with its default threshold).
      6. Format certainties as percentages (``process_certainities``).
      7. Rename the columns to localized headers (``process_keys``).

    Args:
        results: Anything ``pd.DataFrame`` accepts that yields the
            ``category_init`` and ``certainty_init`` columns.
        lang: Language key forwarded to the localization helpers.

    Returns:
        The processed DataFrame.
    """
    return (
        pd.DataFrame(results)
        .pipe(process_categories, lang)
        .groupby(by=category_init, as_index=False)
        .sum()
        .sort_values(by=[certainty_init], ascending=False)
        .pipe(filter_results)
        .pipe(process_certainities)
        .pipe(process_keys, lang)
    )
|
|