import threading

import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
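
# A small causal-LM checkpoint on the Hugging Face Hub
# (~8M parameters, judging by the model name)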
MODEL_NAME = "MaxLSB/LeCarnet-8M"
# Load tokenizer & model locally
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
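# Inference only: eval mode disables dropout and other train-time behavior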
model.eval()


def respond(
    prompt: str,
    chat_history,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
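    """Stream a completion of `prompt` for gr.ChatInterface.

    `chat_history` is supplied by the interface but unused here: the model
    simply continues the given prefix. The remaining arguments arrive from
    the sliders declared in `additional_inputs` below.
    """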
    inputs = tokenizer(prompt, return_tensors="pt")

    # Text streamer to yield decoded tokens one at a time
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    # Kick off generation in a background thread; model.generate() blocks
    # until it finishes, while the streamer is consumed below
    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    # Stream out partial completions; each yield updates the chat bubble
    accumulated = ""
    for new_text in streamer:
        accumulated += new_text
        yield accumulated

    # The streamer is exhausted once generation finishes; reap the worker
    thread.join()


# Wire it up in Gradio
demo = gr.ChatInterface(
    fn=respond,
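    # Slider values are passed to respond() in order, after (message, history):
    # max_tokens, temperature, top_p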
    additional_inputs=[
        gr.Slider(1, 512, value=128, step=1, label="Max new tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
    ],
    title="Prefix Completion Demo",
    description="Type the beginning of a sentence and watch the model finish it.",
)
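
# Running this file directly starts a local Gradio server
# (by default at http://127.0.0.1:7860)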
if __name__ == "__main__":
    demo.launch()