Update app.py
update device to use only cuda (gpu)
use additional gpu decorator for any function that loads or runs models on the gpu
app.py
CHANGED
@@ -63,11 +63,11 @@ LANGUAGES = {
 loaded_models = {}
 loaded_tokenizers = {}
 
-
+@spaces.GPU(duration=240)
 def load_model_and_tokenizer(model_key):
     if model_key not in loaded_models:
         model_info = MODELS[model_key]
-        device =
+        device = "cuda"
         model = AutoModelForCausalLM.from_pretrained(
             model_info["model_name"],
             token=HF_TOKEN,
@@ -84,13 +84,13 @@ def load_model_and_tokenizer(model_key):
         tokenizer.pad_token = tokenizer.eos_token
         loaded_tokenizers[model_key] = tokenizer
 
-
+@spaces.GPU(duration=240)
 def generate_text(model_choice, prompt, max_length, temperature, top_p, do_sample):
     load_model_and_tokenizer(model_choice)
 
     model = loaded_models[model_choice]
     tokenizer = loaded_tokenizers[model_choice]
-    device =
+    device = "cuda"
 
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)
 
@@ -136,7 +136,7 @@ def update_language(selected_language):
     )
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=240)
 def wrapped_generate_text(model_choice, prompt, max_length, temperature, top_p, do_sample):
     return generate_text(model_choice, prompt, max_length, temperature, top_p, do_sample)
 
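For context, `@spaces.GPU` comes from Hugging Face's `spaces` package for ZeroGPU Spaces: each call to a decorated function is granted a CUDA device for up to `duration` seconds, which is why the commit can hard-code `device = "cuda"` inside those functions. Below is a minimal, self-contained sketch of the pattern this commit applies; the model handling is simplified, and the model name, prompt, and generation settings are illustrative rather than taken from the actual app.py.

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

loaded_models = {}       # cache so each model's weights are loaded only once
loaded_tokenizers = {}

@spaces.GPU(duration=240)  # ZeroGPU attaches a CUDA device for up to 240 s per call
def load_model_and_tokenizer(model_name):
    if model_name not in loaded_models:
        loaded_models[model_name] = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float16
        ).to("cuda")  # cuda only: the decorator guarantees a GPU is present
        loaded_tokenizers[model_name] = AutoTokenizer.from_pretrained(model_name)

@spaces.GPU(duration=240)  # any function that runs the model also gets the decorator
def generate_text(model_name, prompt):
    load_model_and_tokenizer(model_name)
    model = loaded_models[model_name]
    tokenizer = loaded_tokenizers[model_name]
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)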