PaperClassification / results.py
Valeriy Sinyukov
Don't show categories with less than 1% confidence
bd0a951
raw
history blame contribute delete
5.49 kB
import numpy as np
import pandas as pd
from languages import en, ru
# Names of the columns the functions below read from the incoming results
# frame: the category label and its certainty value. process_keys() renames
# them to localized, human-readable headers at the end of the pipeline.
# NOTE(review): presumably these match the keys emitted by the classifier
# ("label"/"score") -- confirm against the caller.
category_init = "label"
certainty_init = "score"
def filter_results(results: pd.DataFrame, top_k=0.95):
    """Keep the leading rows that together cover ``top_k`` of the certainty.

    Rows are taken in their existing order until the running sum of the
    ``certainty_init`` column reaches ``top_k``; the row on which the
    threshold is reached is included. Rows whose own certainty is below 1%
    are then dropped.

    Args:
        results: Frame with a ``certainty_init`` column. The cut-off is
            applied in row order, so the frame is expected to be sorted by
            descending certainty already.
        top_k: Cumulative certainty to cover (0.95 means 95%).

    Returns:
        A new DataFrame holding the selected rows; ``results`` itself is
        left untouched.
    """
    certs = results[certainty_init].to_numpy()
    below_threshold = certs.cumsum() < top_k

    if below_threshold.all():
        # The running sum never reaches top_k (or there are no rows at all).
        # np.argmin would return 0 here -- truncating the output to a single
        # row -- and raises on an empty array, so keep every row instead.
        keep = len(results)
    else:
        # argmin over booleans yields the position of the first False, i.e.
        # the row where the running sum reaches top_k; +1 to include it.
        keep = int(np.argmin(below_threshold)) + 1

    head = results.iloc[:keep]
    # Return an independent frame so that later in-place column writes by
    # callers never target a slice of the input.
    return head[head[certainty_init] >= 0.01].copy()
def process_keys(results: pd.DataFrame, lang):
    """Rename the raw result columns to localized, human-readable headers.

    Args:
        results: Frame whose columns include ``category_init`` and
            ``certainty_init``.
        lang: Language key (``en`` or ``ru``); any other value falls back
            to the English headers.

    Returns:
        A renamed copy of ``results`` (``DataFrame.rename`` does not modify
        the input).
    """
    category_titles = {en: "Category", ru: "Категория"}
    certainty_titles = {en: "Certainty", ru: "Уверенность"}

    # Resolve both headers up front, defaulting to English.
    category_title = category_titles.get(lang, category_titles[en])
    certainty_title = certainty_titles.get(lang, certainty_titles[en])

    header_map = {
        category_init: category_title,
        certainty_init: certainty_title,
    }
    return results.rename(columns=header_map)
def process_categories(results, lang):
    """Replace category codes with human-readable names in ``lang``.

    Each value in the ``category_init`` column is reduced to the part before
    its first ``"."`` (e.g. ``"cs.LG"`` -> ``"cs"``) and then looked up in
    the translation table. Codes missing from the table are kept as the
    reduced code. The keys look like arXiv archive identifiers.

    Args:
        results: DataFrame with a ``category_init`` column of string codes.
        lang: Language key (``en`` or ``ru``); any other value falls back
            to the English names, matching ``process_keys``.

    Returns:
        The same ``results`` object; its ``category_init`` column is
        overwritten in place.
    """
    categories = {
        en: {
            "math": "Math",
            "astro-ph": "Astrophysics",
            "cond-mat": "Condensed matter physics",
            "hep-ph": "High energy physics -- Phenomenology",
            "physics": "Physics",
            "hep-th": "High energy physics -- Theory",
            "cs": "Computer Science",
            "quant-ph": "Quantum physics",
            "gr-qc": "General Relativity and Quantum Cosmology",
            "math-ph": "Mathematical Physics",
            "nucl-th": "Nuclear Theory",
            "eess": "Electrical Engineering and Systems Science",
            "q-bio": "Quantitative Biology",
            "nlin": "Nonlinear Sciences",
            "stat": "Statistics",
            "hep-lat": "High Energy Physics - Lattice",
            "hep-ex": "High Energy Physics - Experiment",
            "nucl-ex": "Nuclear Experiment",
            # Fixed user-facing typo: was "Economins".
            "econ": "Economics",
            "q-alg": "Quantum Algebra",
            "q-fin": "Quantitative Finance",
            "alg-geom": "Algebraic Geometry",
            "supr-con": "Superconductivity",
            "chao-dyn": "Chaotic dynamics",
            "dg-ga": "Differential Geometry",
            "funct-an": "Functional analysis",
            "atom-ph": "Atomic physics",
            "chem-ph": "Chemical Physics",
            "ao-sci": "Atmospheric and Oceanic Physics",
            "acc-phys": "Accelerator Physics",
            "bayes-an": "Bayesian statistics",
            "plasm-ph": "Plasma Physics",
        },
        ru: {
            "math": "Математика",
            "astro-ph": "Астрофизика",
            "cond-mat": "Физика конденсированного состояния",
            "hep-ph": "Физика элементарных частиц -- Феноменология",
            "physics": "Физика",
            "hep-th": "Физика элементарных частиц -- Теория",
            "cs": "Компьютерные науки",
            "quant-ph": "Квантовая физика",
            "gr-qc": "Общая теория относительности и квантовая космология",
            "math-ph": "Математическая физика",
            "nucl-th": "Ядерная физика",
            "eess": "Электротехника и системоведение",
            "q-bio": "Количественная биология",
            "nlin": "Нелинейные науки",
            "stat": "Статистика",
            "hep-lat": "Физика элементарных частиц -- Решетки",
            "hep-ex": "Экспериментальная физика элементарных частиц",
            "nucl-ex": "Ядерный эксперимент",
            "econ": "Экономика",
            "q-alg": "Квантовая алгебра",
            "q-fin": "Количественные финансы",
            "alg-geom": "Алгебраическая геометрия",
            "supr-con": "Сверхпроводимость",
            "chao-dyn": "Теория хаоса",
            "dg-ga": "Дифференциальная геометрия",
            "funct-an": "Функциональный анализ",
            "atom-ph": "Атомная физика",
            "chem-ph": "Химическая физика",
            "ao-sci": "Физика атмосферы и океана",
            "acc-phys": "Физика ускорителей",
            "bayes-an": "Байесовская статистика",
            "plasm-ph": "Физика плазмы",
        },
    }

    # Unsupported languages get the English names, the same fallback that
    # process_keys applies to the column headers.
    names = categories.get(lang, categories[en])

    def process_category(category):
        # Keep only the part before the first "."; a code without a dot is
        # returned unchanged by partition().
        archive = category.partition(".")[0]
        return names.get(archive, archive)

    results[category_init] = results[category_init].apply(process_category)
    return results
def process_certainities(results):
    """Format the certainty column as percentage strings.

    Each value in the ``certainty_init`` column is multiplied by 100 and
    rendered with two decimals and a trailing ``%`` (0.1234 -> "12.34%").

    Args:
        results: DataFrame with a numeric ``certainty_init`` column.

    Returns:
        The same ``results`` object; the column is overwritten in place.
    """
    percent_template = "{0:0.2f}%"

    def as_percent(value):
        return percent_template.format(100 * value)

    formatted = results[certainty_init].apply(as_percent)
    results[certainty_init] = formatted
    return results
def process_results(results, lang):
    """Turn raw classification output into a localized display table.

    Steps, in order: build a DataFrame, translate category codes, sum the
    rows that share a translated category, sort by descending certainty,
    trim with ``filter_results``, format certainties as percentages, and
    rename the columns to localized headers.

    Args:
        results: Anything ``pd.DataFrame`` accepts that yields the
            ``category_init`` and ``certainty_init`` columns.
            NOTE(review): presumably a list of ``{"label", "score"}``
            dicts from the classifier -- confirm against the caller.
        lang: Language key forwarded to the localization helpers.

    Returns:
        The processed DataFrame, ready for display.
    """
    return (
        pd.DataFrame(results)
        .pipe(process_categories, lang)
        # Several raw codes can map to one category; merge their scores.
        .groupby(by=category_init, as_index=False)
        .sum()
        .sort_values(by=[certainty_init], ascending=False)
        .pipe(filter_results)
        .pipe(process_certainities)
        .pipe(process_keys, lang)
    )