Spaces:

readomni
/

literate

Running

App Files Files Community

literate / _next /static /chunks /app /methodology /page-9ca0cab8e776add5.js

ekojs's picture

feat: LiTERatE v0.1

0d8af24 verified 3 months ago

19.9 kB

/*
 * Webpack client chunk 910 (per the chunk path, the Next.js "/methodology" page).
 *
 * Registers two modules on the shared `webpackChunk_N_E` array:
 *   - 1965: the Methodology page (default export) plus the example chunk viewer
 *           and the two example chunks it displays.
 *   - 6504: the client entry that asynchronously requires module 1965.
 * The trailing callback asks the webpack runtime to execute 6504 once the
 * listed dependency chunks have loaded.
 */
(self.webpackChunk_N_E = self.webpackChunk_N_E || []).push([
  [910],
  {
    1965: (__unused_webpack_module, __webpack_exports__, __webpack_require__) => {
      "use strict";
      __webpack_require__.r(__webpack_exports__);
      __webpack_require__.d(__webpack_exports__, { default: () => MethodologyPage });

      // Module ids come from the bundler. Presumably 1500 is the JSX runtime,
      // 3870 the shared page layout (used via `.A`) and 432 React itself.
      const jsxRuntime = __webpack_require__(1500);
      const layout = __webpack_require__(3870);
      const React = __webpack_require__(432);

      // Thin wrappers so the runtime functions are looked up at call time and
      // invoked without a `this` binding, exactly like `(0, s.jsx)(...)`.
      const jsx = (type, props, key) => (0, jsxRuntime.jsx)(type, props, key);
      const jsxs = (type, props, key) => (0, jsxRuntime.jsxs)(type, props, key);

      // Class names shared by several elements.
      const PANEL_TITLE_CLASS = "text-sm font-semibold mb-2 text-muted-foreground";
      const CARD_CLASS = "bg-card p-4 rounded border border-border";
      const CONTEXT_CARD_CLASS = "bg-muted/50 p-4 rounded border border-border text-sm";
      const NAV_BUTTON_CLASS =
        "px-3 py-1 rounded bg-primary/10 hover:bg-primary/20 disabled:opacity-50 disabled:cursor-not-allowed";
      const SECTION_HEADING_CLASS = "text-xl font-semibold mb-3";
      const MUTED_CLASS = "text-muted-foreground";
      const MUTED_FOLLOW_UP_CLASS = "text-muted-foreground mt-3";
      const BULLET_LIST_CLASS = "list-disc pl-6 mt-2 space-y-1 text-muted-foreground";

      const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };

      /** Converts any value to a string, mapping null/undefined to "". */
      const asText = (value) => (value == null ? "" : String(value));

      /** Escapes the characters that are significant in HTML. */
      const escapeHtml = (value) => asText(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

      /** Escapes regex metacharacters so a glossary term is matched literally. */
      const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

      /** Escapes plain text and turns its newlines into `<br>` for innerHTML use. */
      const toLineBreakHtml = (text) => asText(text).split("\n").map(escapeHtml).join("<br>");

      /** Markup for one highlighted term with its hover tooltip. */
      const renderTerm = (entry) =>
        [
          '<span class="bg-amber-100 dark:bg-amber-300/30 px-1 rounded cursor-pointer group relative">\n ',
          escapeHtml(entry.raw),
          '\n <span class="absolute hidden group-hover:block bg-background text-popover-foreground p-2 rounded-lg shadow-lg -top-10 left-0 z-10 text-sm min-w-24 px-4 text-center">\n ',
          escapeHtml(entry.translation),
          "\n </span>\n </span>",
        ].join("");

      /**
       * Builds the HTML for the source text with glossary terms highlighted.
       *
       * The text is tokenised in a single pass with one alternation regex
       * (longest term first), so:
       *   - terms are matched literally, never as regex syntax;
       *   - markup inserted for one term can never be re-matched by another;
       *   - `$` sequences in a translation are not treated as replacement patterns;
       *   - every piece of text is HTML-escaped before it reaches innerHTML.
       *
       * @param {string} text - Source text; "\n" separates lines.
       * @param {Array<{raw: string, translation: string}>} glossary - Terms to highlight.
       * @returns {string} HTML string with lines joined by `<br>`.
       */
      const highlightGlossary = (text, glossary) => {
        const lines = asText(text).split("\n");
        const terms = (Array.isArray(glossary) ? glossary : [])
          // An empty term would match at every position; skip it.
          .filter((entry) => entry && typeof entry.raw === "string" && entry.raw.length > 0)
          .sort((a, b) => b.raw.length - a.raw.length);
        if (terms.length === 0) {
          return lines.map(escapeHtml).join("<br>");
        }

        const termsByRaw = new Map();
        for (const entry of terms) {
          // First entry wins when the same term is listed twice.
          if (!termsByRaw.has(entry.raw)) {
            termsByRaw.set(entry.raw, entry);
          }
        }

        // With a single capture group, split() yields plain text at even
        // indices and matched terms at odd indices.
        const splitter = new RegExp(`(${terms.map((entry) => escapeRegExp(entry.raw)).join("|")})`);
        return lines
          .map((line) =>
            line
              .split(splitter)
              .map((segment, index) =>
                index % 2 === 1 ? renderTerm(termsByRaw.get(segment)) : escapeHtml(segment)
              )
              .join("")
          )
          .join("<br>");
      };

      /** A titled panel whose body is pre-built HTML. */
      const htmlPanel = (title, cardClassName, html) =>
        jsxs("div", {
          children: [
            jsx("h5", { className: PANEL_TITLE_CLASS, children: title }),
            jsx("div", {
              className: cardClassName,
              children: jsx("div", { dangerouslySetInnerHTML: { __html: html } }),
            }),
          ],
        });

      /**
       * Paged viewer for example chunks: source text with highlighted glossary
       * terms, optional previous/next context, the glossary table and the
       * human translation.
       *
       * @param {{chunks: Array<object>, initialChunkIndex?: number}} props
       * @returns The viewer element, or a placeholder when there is no chunk
       *          at the current index.
       */
      function ChunkViewer(props) {
        const { chunks = [], initialChunkIndex = 0 } = props;
        const [activeIndex, setActiveIndex] = (0, React.useState)(initialChunkIndex);
        const chunk = chunks[activeIndex];

        if (!chunk) {
          return jsx("div", {
            className: "text-center p-8 text-muted-foreground",
            children: "No chunks available to display",
          });
        }

        const lastIndex = chunks.length - 1;
        const go = (direction) => {
          if (direction === "prev" && activeIndex > 0) {
            setActiveIndex(activeIndex - 1);
          } else if (direction === "next" && activeIndex < lastIndex) {
            setActiveIndex(activeIndex + 1);
          }
        };
        const glossary = Array.isArray(chunk.glossary) ? chunk.glossary : [];

        const header = jsxs("div", {
          className: "bg-muted p-4 flex items-center justify-between",
          children: [
            jsxs("h4", {
              className: "font-medium",
              children: ["Chunk ", activeIndex + 1, " of ", chunks.length],
            }),
            jsxs("div", {
              className: "flex gap-2",
              children: [
                jsx("button", {
                  onClick: () => go("prev"),
                  disabled: activeIndex === 0,
                  className: NAV_BUTTON_CLASS,
                  children: "Previous",
                }),
                jsx("button", {
                  onClick: () => go("next"),
                  disabled: activeIndex === lastIndex,
                  className: NAV_BUTTON_CLASS,
                  children: "Next",
                }),
              ],
            }),
          ],
        });

        const glossaryTable = jsxs("div", {
          children: [
            jsx("h5", { className: PANEL_TITLE_CLASS, children: "Glossary" }),
            jsx("div", {
              className: CARD_CLASS,
              children: jsxs("table", {
                className: "w-full text-sm",
                children: [
                  jsx("thead", {
                    children: jsxs("tr", {
                      className: "border-b",
                      children: [
                        jsx("th", { className: "text-left pb-2", children: "Term" }),
                        jsx("th", { className: "text-left pb-2", children: "Translation" }),
                        jsx("th", { className: "text-left pb-2", children: "Gender" }),
                      ],
                    }),
                  }),
                  jsx("tbody", {
                    children: glossary.map((entry, rowIndex) =>
                      jsxs(
                        "tr",
                        {
                          className: "border-b last:border-0",
                          children: [
                            jsx("td", { className: "py-2", children: entry.raw }),
                            jsx("td", { className: "py-2", children: entry.translation }),
                            jsx("td", { className: "py-2", children: entry.gender }),
                          ],
                        },
                        rowIndex
                      )
                    ),
                  }),
                ],
              }),
            }),
          ],
        });

        const sourceColumn = jsxs("div", {
          className: "space-y-4",
          children: [
            htmlPanel("Source Text", CARD_CLASS, highlightGlossary(chunk.text, glossary)),
            chunk.prev && htmlPanel("Previous Context", CONTEXT_CARD_CLASS, toLineBreakHtml(chunk.prev)),
            chunk.next && htmlPanel("Next Context", CONTEXT_CARD_CLASS, toLineBreakHtml(chunk.next)),
            glossaryTable,
          ],
        });

        const translationColumn = jsx("div", {
          className: "space-y-4",
          children: htmlPanel("Human Translation", CARD_CLASS, toLineBreakHtml(chunk.translation)),
        });

        return jsxs("div", {
          className: "border rounded-lg overflow-hidden",
          children: [
            header,
            jsxs("div", {
              className: "grid grid-cols-1 md:grid-cols-2 gap-4 p-4",
              children: [sourceColumn, translationColumn],
            }),
          ],
        });
      }

      // Example chunks shown on the page (one Chinese, one Japanese source).
      const EXAMPLE_CHUNKS = [
        {
          id: "01954343-4346-7f3b-9129-34c648f6e738",
          source: "c9c4c1d8-29a4-4eb7-9e9c-6537c2b19958",
          text: '长庚蓦地一转身："备纸笔。"\n\n侍卫连忙追上去："殿下，你的手……"\n\n长庚闻言一顿，抄起顾昀落下的酒壶，面无表情地将那一壶烈酒全冲到了双手的伤口上，本来已经结痂的伤口再次被冲出血水来，他从怀中取出一块帕子，浑不在意地一裹。\n\n此时京城中，谁也没料到一个老太监的死竟然引发了这样一场轩然大波。\n\n谭鸿飞压抑二十年的冤屈爆发，大约已经失心疯了，先是派兵围了王国舅府邸，得知那老东西竟将老婆孩子抛下，进宫躲风头去了，便立刻掉头，悍然对上了赶来救场的御林军。\n\n御林军素日与北大营一主内、一主外，同为京畿重地的最后一道防线，是抬头不见低头见的交情，御林军主要由京城里走门路吃皇粮的少爷兵和从北大营抽调选拔的精英两部分组成，前者早就吓得尿了裤子，根本指望不上，后者虽然有本事，但骤然与"娘家"对上，一时间也是进退维谷，正如长庚预料，很快便溃不成军。',
          glossary: [
            { gender: "masculine", raw: "长庚", translation: "Chang Geng" },
            { gender: "neuter", raw: "殿下", translation: "Your Highness" },
            { gender: "masculine", raw: "顾昀", translation: "Gu Yun" },
            { gender: "masculine", raw: "谭鸿飞", translation: "Tan Hongfei" },
            { gender: "masculine", raw: "王国舅", translation: "Imperial Uncle Wang" },
            { gender: "neuter", raw: "御林军", translation: "Imperial Guard" },
            { gender: "neuter", raw: "北大营", translation: "Northern Camp" },
          ],
          translator: "Human",
          translation:
            'Chang Geng spun around. "Prepare a brush and paper."\n\n"Your Highness, your hands..." The guard chased after him.\n\nChang Geng paused, picked up Gu Yun\'s abandoned jar of wine, and, with no change in expression, poured the whole jar of strong liquor over the wounds on his hands. The cuts, which had already begun to scab over, bled again with the rush of liquid. Chang Geng carelessly retrieved a handkerchief from his lapels and wrapped them tight.\n\nIn the capital, no one expected that an old eunuch\'s death would raise such a storm of controversy.\n\nThe resentment Tan Hongfei had suppressed for twenty years erupted—he had very likely already lost his mind. He first sent soldiers to surround Imperial Uncle Wang\'s estate. Upon learning that the old bastard had abandoned his wife and children to cower within the palace, he did an about-face and brazenly turned his blade on the Imperial Guard who had rushed to the scene.\n\nThe Imperial Guard and the Northern Camp had always been the last lines of defense for the capital, one within and one without, and the two constantly crossed paths. The Imperial Guard was by and large made up of two groups: young-master soldiers benefitting from nepotism and living off the imperial coffers, and elite soldiers selected from the Northern Camp. The former had already pissed their pants in terror and could not be relied on. The latter were skilled, but, stuck in the impossible position of drawing blades against their maiden family, quickly crumpled. Just as Chang Geng had predicted, in no time at all, the Imperial Guard was defeated.',
          prev: '飞奔而去。\n\n长庚一直盯着他的背影，直到目力无可及，他突然闭了闭眼，几不可闻地喃喃叫了一声："子熹……"\n\n一边的侯府侍卫没听清，疑惑道："殿下说什么？"',
          next: "起鸢楼的笙歌还在绕梁不休，温热的花酒白雾未消，四九城中已经炸了锅。\n\n谭鸿飞带人逼至宫禁之外，",
        },
        {
          id: "01954343-434b-7d57-8b2b-ff7bf247a244",
          source: "f9bc4531-31e0-45d9-9d44-168ca519f3b6",
          text: "「ピクシー01了解。気象情報は? 改善する見込みはないのか」\n\nそれでもうんざりとするような報告を、うんざりとするほど後方から聞かされる気分は最悪だった。コンタクトなしということは、ずっと飛び続けている我々がさらに索敵しなくてはならないという事を意味する。\n\n雨雲を突破しようにも随分と高度を上げねばならない。結局、中途半端に濡れながらの飛行だ。外殻で水をはじくとはいえ、気分の良いものではない。\n\n「ウルバン・コントロールよりの戦域管制情報を送る。......当分は無理だな。陸軍さんに同情したくなるぞ。きっと全員この寒さで地獄を見ているに違いない」\n\n「戦区全域にて豪雨に暴風。現在二級洪水警報並びに飛行制限勧告発令中? 了解した。作戦参加中の他部隊は?」\n\nぱっと、受信したデータを確認しつつ、天候が著しく悪化しつつあるということを確認してターニャは気が遠くなりかけていた。いっそ、飛行制限勧告が飛行禁止勧告になれば帰還できるのだが。",
          glossary: [
            { gender: "neuter", raw: "ピクシー01", translation: "Pixie 01" },
            { gender: "neuter", raw: "ウルバン・コントロール", translation: "Urban Control" },
          ],
          translator: "Human",
          translation:
            '"Pixie 01, roger. How about the weather? Can we expect it to improve?"\n\nStill, I\'m fed up with these tiresome reports from the rear. No reports of contact means that even though we\'ve been flying all this time, we have to keep searching.\n\nIf we wanted to get above the rain clouds, we would have to climb awfully high. Instead, we just get wet. Even though our defensive shells repel water, getting rained on doesn\'t do much for the mood.\n\n"Sending war zone data from Urban Control... Looks like it won\'t for a while. I feel for the ground troops. They must be in hell with this cold."\n\n"The whole combat zone is hard rain and storm winds. A level-two flood warning and flight restrictions are being issued? Got it. How are the other units in the operation doing?" Tanya checks the data as it suddenly comes in and gets confirmation that the weather is only getting worse, which boggles her mind. That said, if the flight warnings become no-fly advisories, they can return to base.',
          prev: "さすがに、そこまで方向音痴の部下を持ったつもりが無いのは唯一の救いだろう。\n\n「管制より、ピクシー。現在コンタクト報告なし」",
          next: "「キィエール軍港より捜索遊撃任務に第一戦隊が出港中。空軍は特殊強行偵察中隊が索敵任務に出撃。",
        },
      ];

      // Small element builders for the repetitive page content.
      const section = (...children) => jsxs("section", { children });
      const sectionHeading = (title) => jsx("h3", { className: SECTION_HEADING_CLASS, children: title });
      const paragraph = (text, className = MUTED_CLASS) => jsx("p", { className, children: text });
      const bulletList = (items) =>
        jsxs("ul", {
          className: BULLET_LIST_CLASS,
          children: items.map((item) => jsx("li", { children: item })),
        });
      const processStep = (label, description) =>
        jsxs("li", { children: [jsx("strong", { children: label }), description] });

      /**
       * The Methodology page: static explanatory sections plus the example
       * chunk viewer, wrapped in the shared layout component.
       */
      function MethodologyPage() {
        const sections = [
          section(
            sectionHeading("Overview"),
            paragraph(
              "LiTERatE (Literary Translation Evaluation and Rating Ensemble) is a benchmark for evaluating machine translation systems on literary text. Unlike traditional machine translation benchmarks that focus on news articles, technical documentation, or general text, LiTERatE specifically targets literary translation, which presents unique challenges due to its creative and nuanced nature."
            )
          ),
          section(
            sectionHeading("Dataset Composition"),
            paragraph(
              "Our dataset consists of English human translations of novels from Chinese, Japanese, and Korean (CJK) languages. We include a diverse range of translations:"
            ),
            bulletList([
              "Published professional translations",
              "Translations from online publishers",
              "Amateur translations",
            ]),
            paragraph(
              "While published professional translations make up the bulk of our samples to ensure high quality, we deliberately include lower-quality translations for two important reasons:",
              MUTED_FOLLOW_UP_CLASS
            ),
            bulletList([
              "To ensure diversity, as they often encompass less-translated genres and story types",
              "To test system robustness against varying human translation quality",
            ])
          ),
          // NOTE(review): this section says context is "approximately 60 CJK
          // characters" while "System Input Format" below says "approximately
          // 80 CJK characters each" — confirm the real figure and align both.
          section(
            sectionHeading("Evaluation Units"),
            paragraph(
              "Our evaluation is conducted on chunks of 200-500 CJK characters as the basic unit. To ensure a fair and consistent evaluation environment, we:"
            ),
            bulletList([
              "Extract terminology used in the original human translation",
              "Provide these terms as additional input for all systems",
              "Include gender information for each term (neuter, feminine, or masculine)",
              "Provide approximately 60 CJK characters from previous and next chunks as context",
            ]),
            paragraph(
              "This approach allows us to evaluate not only translation quality but also term adherence and contextual understanding.",
              MUTED_FOLLOW_UP_CLASS
            )
          ),
          section(
            sectionHeading("System Input Format"),
            paragraph("All evaluated systems (except the Google NMT baseline) receive the following inputs:"),
            bulletList([
              "The text chunk to be translated (200-500 CJK characters)",
              "Previous and next chunks as context (approximately 80 CJK characters each)",
              "A glossary of terms with their translations and gender information",
            ]),
            paragraph(
              "The Google NMT baseline, which serves as a traditional machine translation reference point, receives only line-by-line input without additional context or terminology data.",
              MUTED_FOLLOW_UP_CLASS
            )
          ),
          section(
            sectionHeading("Evaluation Process"),
            paragraph("Our evaluation process follows these key steps:"),
            jsxs("ol", {
              className: "list-decimal pl-6 mt-2 space-y-3 text-muted-foreground",
              children: [
                processStep(
                  "Chunk Curation:",
                  " We carefully select and prepare text chunks from our dataset, ensuring they represent diverse literary styles, genres, and translation challenges."
                ),
                processStep(
                  "Translation Generation:",
                  " We ask different systems to produce translations based on the raw text, extracted terminology/glossary, and surrounding context."
                ),
                processStep(
                  "Human Reference:",
                  ' Each chunk has a corresponding human translation that serves as a reference point (though not necessarily the "gold standard").'
                ),
                processStep(
                  "Head-to-Head Comparison:",
                  " Our LLM ensemble judges compare each system's translation against the human translation in a direct comparison."
                ),
                processStep(
                  "Scoring:",
                  " Based on these comparisons, we calculate win rates that represent how often each system's translations are judged to be equal to or better than human translations."
                ),
              ],
            }),
            jsxs("div", {
              className: "mt-6",
              children: [
                jsx("h4", { className: "text-lg font-medium mb-4", children: "Example Chunks" }),
                jsx("p", {
                  className: "text-muted-foreground mb-4",
                  children:
                    "Below are examples of chunks from our dataset. You can explore the source text, glossary terms, context, and human translations. Hover over highlighted terms to see their translations.",
                }),
                jsx(ChunkViewer, { chunks: EXAMPLE_CHUNKS }),
              ],
            })
          ),
          section(
            sectionHeading("Evaluation Approach"),
            paragraph(
              "Our benchmark uses an ensemble of Large Language Models (LLMs) as judges to evaluate translations. The evaluation is conducted as head-to-head comparisons between machine translations and human translations."
            ),
            paragraph(
              "To ensure the highest possible accuracy in our evaluation system, we conducted an extensive calibration experiment:",
              MUTED_FOLLOW_UP_CLASS
            ),
            bulletList([
              "Multiple human annotators evaluated several hundred translation pairs",
              "We focused on decisive human verdicts—cases where multiple annotators agreed on a clear winner",
              "This approach addresses the inherently subjective nature of literary translation evaluation, which typically has low inter-annotator agreement",
            ])
          ),
          section(
            sectionHeading("Judge Ensemble"),
            paragraph(
              "Our experiments revealed that using multiple frontier LLMs as judges, each evaluating different aspects of translation quality, and then ensembling their verdicts produces the most accurate results."
            ),
            paragraph(
              "This ensemble approach achieves 82% accuracy when compared to decisive human judgments. For comparison, a single LLM judge would only achieve approximately 60% accuracy.",
              MUTED_FOLLOW_UP_CLASS
            )
          ),
          section(
            sectionHeading("Scoring Methodology"),
            paragraph(
              'For each evaluation unit, our judge ensemble determines whether the machine translation or the human translation is superior, or if the comparison is too close to call ("not-sure").'
            ),
            paragraph("Points are assigned as follows:", MUTED_FOLLOW_UP_CLASS),
            bulletList([
              "Machine translation wins: 1 point",
              'Tie or "not-sure": 0.5 points',
              "Human translation wins: 0 points",
            ]),
            paragraph(
              "The final score for each system is calculated as the average of these points multiplied by 100, representing the system's win rate against human translators. A score of 50 indicates parity with human translation quality.",
              MUTED_FOLLOW_UP_CLASS
            )
          ),
          section(
            sectionHeading("Limitations"),
            paragraph(
              "While our methodology represents a significant advancement in evaluating literary translation, we acknowledge several limitations:"
            ),
            bulletList([
              "Literary translation evaluation is inherently subjective with low inter-annotator agreement",
              "Our current dataset is limited to Chinese, Japanese, and Korean source languages",
              "The evaluation focuses on chunk-level translation rather than document-level coherence",
              "Even with our ensemble approach, there remains an 18% gap between our automated evaluation and decisive human judgment",
            ])
          ),
        ];

        return jsx(layout.A, {
          children: jsxs("div", {
            className: "bg-card text-card-foreground p-8 rounded-lg shadow-md border border-border",
            children: [
              jsx("h2", { className: "text-3xl font-bold mb-6", children: "Methodology" }),
              jsxs("div", { className: "space-y-8 text-foreground", children: sections }),
            ],
          }),
        });
      }
    },

    // Client entry: requires the page module on a microtask.
    6504: (__unused_webpack_module, __unused_webpack_exports, __webpack_require__) => {
      Promise.resolve().then(__webpack_require__.bind(__webpack_require__, 1965));
    },
  },
  (__webpack_require__) => {
    const execute = (moduleId) => __webpack_require__((__webpack_require__.s = moduleId));
    // Run the entry module once the listed chunks are available.
    __webpack_require__.O(0, [475, 870, 815, 702, 358], () => execute(6504));
    // Deliberate assignment to the bundler's `_N_E` global; this callback is
    // not in strict mode when the chunk is loaded as a classic script.
    _N_E = __webpack_require__.O();
  },
]);