# qwen-finance / app.py
import os
import time

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Read the Hugging Face access token from the environment (None if unset).
HF_TOKEN = os.environ.get("HF_TOKEN", None)

zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔
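# On ZeroGPU Spaces the GPU is only attached inside functions decorated with
# @spaces.GPU, which is why the tensor above still reports 'cpu' at import time.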

model_id = 'FINGU-AI/Finance-OrpoMistral-7B'
# SDPA attention is used here; attn_implementation="flash_attention_2" is an
# alternative where the hardware supports it.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
    token=HF_TOKEN,  # needed for gated/private weights; None is fine for public repos
)
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model.to('cuda')

# Sampling settings shared by every generation call.
generation_params = {
    'max_new_tokens': 1000,
    'use_cache': True,
    'do_sample': True,
    'temperature': 0.7,
    'top_p': 0.9,
    # 'top_k': 50,
    'pad_token_id': tokenizer.eos_token_id,  # silences the missing-pad-token warning
}

@spaces.GPU
def inference(query):
    messages = [
        {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a trader."},
        {"role": "user", "content": query},
    ]
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    outputs = model.generate(tokenized_chat, **generation_params)
    # Decode only the newly generated tokens; splitting the full text on
    # "Assistant:" is fragile, since Mistral-style chat templates never emit it.
    new_tokens = outputs[0][tokenized_chat.shape[-1]:]
    assistant_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return assistant_response
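
# Hypothetical smoke test for local runs (the sample query below is
# illustrative, not part of the Space):
#   print(inference("What's your outlook on EUR/USD this week?"))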

def response(message, history):
    # Generate the full reply, then stream it back one character at a time
    # to give the chat window a typing effect.
    text = inference(message)
    for i in range(len(text)):
        time.sleep(0.01)
        yield text[: i + 1]
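
# ChatInterface accepts generator callbacks, so the partial strings yielded
# above are streamed to the chat UI incrementally.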
gr.ChatInterface(response).launch()