import getDB from "@/utils/getDB"
import Head from "next/head"
import Link from "next/link"
import { useRouter } from "next/router"
import { useEffect, useMemo, useState } from "react"
// import styles from '@/styles/Home.module.css'
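// Runs at build time: load prompts and the benchmarked models from the local SQLite database.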
export const getStaticProps = async () => {
const db = await getDB()
const prompts = await db.all(`SELECT * FROM prompts ORDER BY text ASC`)
// get all models that have at least 1 result
const models = await db.all(
`SELECT * FROM models WHERE id IN (SELECT DISTINCT model FROM results) ORDER BY name ASC`
)
return { props: { prompts, models } }
}
export default function Home({ prompts, models }) {
const router = useRouter()
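// Current view mode ("prompt" or "model"), initialized from the URL query param.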
const [viewBy, setViewBy] = useState(router.query.viewBy || "prompt")
const changeView = (viewBy) => {
router.push({ query: { viewBy } })
}
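// Sync local state when the query param changes (e.g. back/forward navigation).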
useEffect(() => {
if (router.query.viewBy) setViewBy(router.query.viewBy)
}, [router.query.viewBy])
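// Unique prompt categories, derived once from the prompts list.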
const types = useMemo(() => {
return Array.from(new Set(prompts.map((p) => p.type)))
}, [prompts])
return (
<>
<Head>
<title>LLM Benchmarks</title>
<meta
name="description"
content="Human-readable benchmarks of 60+ open-source and proprietary LLMs."
/>
<meta name="viewport" content="width=device-width, initial-scale=1" />
</Head>
<main>
<h1>Asking 60+ LLMs a set of 20 questions</h1>
<br />
<p>
Benchmarks like HellaSwag are a bit too abstract for me to get a
sense of how well models perform in real-world workflows.
</p>
<br />
<p>
I had the idea of writing a script that runs prompts testing basic
reasoning, instruction following, and creativity against around 60
models that I could get my hands on through inference APIs.
</p>
<br />
<p>
The script stored all the answers in a SQLite database, and these
pages show the raw results.
</p>
<br />
<br />
<p>
{`view: `}
<a href="#" onClick={() => changeView("prompt")}>
all prompts
</a>{" "}
/{" "}
<a href="#" onClick={() => changeView("model")}>
all models
</a>
</p>
<br />
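{/* Group prompts by category, or list all models, depending on the selected view */}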
{viewBy === "prompt" ? (
<>
{types.map((type) => (
<div key={type}>
<p>{type}:</p>
<br />
<ul>
{prompts
.filter((p) => p.type === type)
.map((prompt) => (
<li key={prompt.slug}>
<pre style={{ maxWidth: 800 }}>
{prompt.text}
<br />
<br />
<Link href={`/${prompt.slug}`}>results</Link>
</pre>
</li>
))}
</ul>
</div>
))}
</>
) : (
<ul>
{models.map((model) => (
<li key={model.id}>
{model.name} -{" "}
<Link
href={`/model/${model.api_id.split("/").pop().toLowerCase()}`}
>
results
</Link>
</li>
))}
</ul>
)}
<br />
<br />
<h3>Notes</h3>
<br />
<ul>
<li>
I used a temperature of 0 and a max token limit of 240 for each test
(that's why a lot of answers are cropped). All other settings were
left at their defaults.
</li>
<li>
I made this with a mix of APIs from OpenRouter, TogetherAI, OpenAI,
Cohere, Aleph Alpha & AI21.
</li>
<li>
<b>This is imperfect.</b> I want to improve this by using better
stop sequences and prompt formatting tailored to each model. But
hopefully it can already make picking models a bit easier.
</li>
<li>
Ideas for the future: public votes to compute an Elo rating,
comparing 2 models side by side, community-submitted prompts (open to
suggestions)
</li>
<li>
Prompt suggestions, feedback, or just to say hi: vince [at] llmonitor.com
</li>
<li>
{`Shameless plug: I'm building an `}
<a href="https://github.com/llmonitor/llmonitor" target="_blank">
open-source observability tool for AI devs.
</a>
</li>
</ul>
<br />
<br />
<table style={{ maxWidth: 600 }}>
  <tbody>
    <tr>
      <td>
        <p>
          Edit: as this got popular, I added an email form to receive
          notifications for future benchmark results:
        </p>
        <iframe
          src="https://embeds.beehiiv.com/65bd6af1-2dea-417a-baf2-b65bc27e1610?slim=true"
          height="52"
          frameBorder="0"
          scrolling="no"
          style={{
            width: 400,
            border: "none",
            transform: "scale(0.8)",
            transformOrigin: "left",
          }}
        ></iframe>
        <br />
        <small>(no spam, max 1 email per month)</small>
      </td>
    </tr>
  </tbody>
</table>
<br />
</main>
</>
)
}