MaxLSB committed on
Commit 852d26e · verified · 1 Parent(s): 36942d4

Update app.py

Files changed (1):
  app.py (+46 -26)
app.py CHANGED

```diff
@@ -1,47 +1,67 @@
 import os
-import gradio as gr
-from huggingface_hub import InferenceClient
+import threading
 
+import gradio as gr
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+)
 
-hf_token = os.environ["HUGGINGFACEHUB_API_TOKEN"]
-client = InferenceClient("MaxLSB/LeCarnet-8M", token=hf_token)
+MODEL_NAME = "MaxLSB/LeCarnet-8M"
 
+# Load tokenizer & model locally
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+model.eval()
 
 def respond(
-    prompt,
+    prompt: str,
     chat_history,
-    max_tokens,
-    temperature,
-    top_p,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
 ):
-    response = ""
+    inputs = tokenizer(prompt, return_tensors="pt")
 
-    for token in client.text_generation(
-        prompt=prompt,
+    # Text streamer to get one token at a time
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True,
+    )
+
+    generate_kwargs = dict(
+        **inputs,
+        streamer=streamer,
         max_new_tokens=max_tokens,
+        do_sample=True,
         temperature=temperature,
         top_p=top_p,
-        stream=True,
-    ):
-        response += token
-        yield response
+    )
+
+    # Kick off generation in background
+    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
 
+    # Stream out partial completions
+    accumulated = ""
+    for new_text in streamer:
+        accumulated += new_text
+        yield accumulated
 
+# Wire it up in Gradio
 demo = gr.ChatInterface(
-    respond,
+    fn=respond,
     additional_inputs=[
-        gr.Slider(minimum=1, maximum=512, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+        gr.Slider(1, 512, value=128, step=1, label="Max new tokens"),
+        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p"),
     ],
+    title="Prefix Completion Demo",
+    description="Type the beginning of a sentence and watch the model finish it.",
 )
 
-
 if __name__ == "__main__":
     demo.launch()
```
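
The pattern the new version adopts can be exercised outside Gradio as well: `model.generate` runs on a background thread while `TextIteratorStreamer` yields decoded text chunks to the main thread as they are produced. A minimal standalone sketch, assuming the MaxLSB/LeCarnet-8M weights download successfully; the prompt string is an arbitrary example, not part of the commit:

```python
# Standalone sketch of the streaming pattern used in the updated app.py:
# generate() runs in a background thread, TextIteratorStreamer yields decoded
# text as it arrives. The prompt below is an arbitrary example.
import threading

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("MaxLSB/LeCarnet-8M")
model = AutoModelForCausalLM.from_pretrained("MaxLSB/LeCarnet-8M")
model.eval()

inputs = tokenizer("Il était une fois", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Run generation in the background so the main thread can consume the stream.
thread = threading.Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=64, do_sample=True),
)
thread.start()

for chunk in streamer:  # blocks until the next decoded piece is ready
    print(chunk, end="", flush=True)
thread.join()  # generation has finished once the loop exits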
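
Because `respond` is a generator that yields the accumulated text, `gr.ChatInterface` renders each yield as a progressively longer reply, which is what produces the streaming effect in the UI. It also means the function can be smoke-tested without launching the app; a hypothetical check, with the prompt again an arbitrary example:

```python
# Hypothetical smoke test for respond(), run in the same module as app.py.
last = ""
for partial in respond("Il était une fois", [], max_tokens=32, temperature=0.7, top_p=0.9):
    last = partial  # each yield is the full completion so far
print(last)
```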