# qwen-finance / app.py
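"""Gradio chat demo for the FINGU-AI/Finance-OrpoMistral-7B model, intended to run on a Hugging Face ZeroGPU Space."""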
import os
import time

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Read the Hugging Face access token from the environment (None if it is not set)
HF_TOKEN = os.environ.get("HF_TOKEN", None)
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu', since no GPU is attached at import time
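# On a ZeroGPU Space the GPU is attached only while a @spaces.GPU-decorated function runs,
# which is why the tensor above still reports 'cpu' at import time.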
model_id = 'FINGU-AI/Finance-OrpoMistral-7B'
# Load in bfloat16 with SDPA attention (flash_attention_2 is an alternative if it is installed).
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.to('cuda')
# terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]
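# Decoding settings: sampling at temperature 0.7 with nucleus (top_p=0.9) and top-k (50)
# filtering, up to 1000 new tokens per reply, with the KV cache enabled.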
generation_params = {
    'max_new_tokens': 1000,
    'use_cache': True,
    'do_sample': True,
    'temperature': 0.7,
    'top_p': 0.9,
    'top_k': 50,
}
@spaces.GPU
def inference(query):
    messages = [
        {"role": "system", "content": """You are a friendly AI assistant named Grinda, specialized in assisting users with trade and stock-related queries. Your tasks include providing insightful suggestions, tips, and winning trade strategies."""},
        {"role": "user", "content": f"{query}"},
    ]
    tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
    outputs = model.generate(tokenized_chat, **generation_params)
    # Decode only the newly generated tokens so the prompt is not echoed back in the reply.
    assistant_response = tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True).strip()
    return assistant_response
def response(message, history):
    # Generate the full reply, then stream it back to the chat UI character by character.
    text = inference(message)
    for i in range(len(text)):
        time.sleep(0.01)
        yield text[: i + 1]
gr.ChatInterface(response).launch()
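# Note: this script assumes it runs inside a Hugging Face ZeroGPU Space; outside that
# environment the `spaces` import, the @spaces.GPU decorator, and the .cuda()/.to('cuda')
# calls would require a CUDA-capable machine or have to be removed.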