Chengxb888 committed
Commit bfd07fd · verified · 1 Parent(s): 0ed1ae4

Update app.py

Files changed (1):
  1. app.py +15 -8
app.py CHANGED

@@ -14,15 +14,22 @@ async def root():
 @app.post("/hello/")
 def say_hello(msg: Annotated[str, Form()]):
     print("model")
-    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
-    model = AutoModelForCausalLM.from_pretrained(
-        "google/gemma-2b-it",
-        device_map="cpu",
-        torch_dtype=torch.bfloat16
-    )
-    print("token & msg")
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    checkpoint = "HuggingFaceTB/SmolLM-1.7B-Instruct"
+
+    device = "cpu"  # "cuda" for GPU usage or "cpu" for CPU usage
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    # for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
+    model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
+
+    messages = [{"role": "user", "content": "things about elasticsearch"}]
+    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
+    print(input_text)
+
     input_ids = tokenizer(msg, return_tensors="pt").to("cpu")
     print("output")
-    outputs = model.generate(**input_ids, max_length=500)
+    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    outputs = model.generate(inputs, max_new_tokens=256, temperature=0.6, top_p=0.92, do_sample=True)
+
     print("complete")
     return {"message": tokenizer.decode(outputs[0])}