Update app.py
app.py CHANGED
@@ -1,4 +1,5 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch, dispatch_model, infer_auto_device_map
 import streamlit as st
 from huggingface_hub import login
 import pandas as pd
@@ -10,13 +11,23 @@ login(huggingface_token)
 # Load the tokenizer and model
 model_id = "meta-llama/Llama-3.2-1B"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id)  #, device_map="auto")
 tokenizer.pad_token = tokenizer.eos_token
 
 MAX_INPUT_TOKEN_LENGTH = 10000
 
+# Load the model with disk_offload
+with init_empty_weights():
+    model = AutoModelForCausalLM.from_config(model_id)
+
+device_map = infer_auto_device_map(model, max_memory={"disk": "2GiB"}, no_split_module_classes=["LlamaDecoderLayer"])
+model = load_checkpoint_and_dispatch(model, model_id, device_map=device_map, offload_folder="offload_dir")
+
+MAX_INPUT_TOKEN_LENGTH = 10000
+
+
 def generate_response(input_text, temperature=0.7, max_new_tokens=20):
-    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(
+    input_ids = tokenizer.encode(input_text, return_tensors='pt').to("cpu")  # Use 'cpu' to maintain compatibility
 
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -32,7 +43,7 @@ def generate_response(input_text, temperature=0.7, max_new_tokens=20):
         top_p=0.9,
         temperature=temperature,
         num_return_sequences=3,
-        eos_token_id=tokenizer.eos_token_id
+        eos_token_id=tokenizer.eos_token_id
     )
 
     try:
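As committed, the offload block hands the hub id string to two APIs that expect something else: AutoModelForCausalLM.from_config takes a config object, and load_checkpoint_and_dispatch takes a local checkpoint path. Below is a minimal sketch of the disk-offload pattern the commit appears to be aiming for, assuming snapshot_download is used to materialize the weights locally; the memory budgets are placeholders, not values from the commit.

from transformers import AutoConfig, AutoModelForCausalLM
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download

model_id = "meta-llama/Llama-3.2-1B"

# Build an empty (meta-device) skeleton; no weight memory is allocated here.
# from_config takes a config object, not a hub id string.
config = AutoConfig.from_pretrained(model_id)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Plan placement: weights that exceed the budgets spill to disk.
# Budgets below are illustrative, not taken from the commit.
device_map = infer_auto_device_map(
    model,
    max_memory={"cpu": "4GiB", "disk": "2GiB"},
    no_split_module_classes=["LlamaDecoderLayer"],
)

# load_checkpoint_and_dispatch needs a local path, so download the checkpoint first.
checkpoint_dir = snapshot_download(model_id)
model = load_checkpoint_and_dispatch(
    model,
    checkpoint_dir,
    device_map=device_map,
    offload_folder="offload_dir",
)

With this wiring, the device_map="auto" argument commented out of from_pretrained becomes unnecessary: the dispatch step decides placement instead.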
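Only fragments of generate_response survive in the hunks. The following is a self-contained sketch of the generation path they imply, not the commit's exact code: do_sample=True and pad_token_id are assumptions (sampling is required for top_p/temperature to take effect, and setting pad_token_id avoids a warning when pad equals eos), and the decode step does not appear in the diff at all.

import torch

MAX_INPUT_TOKEN_LENGTH = 10000

def generate_response(input_text, temperature=0.7, max_new_tokens=20):
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cpu")

    # Keep only the most recent tokens when the prompt is too long.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,  # assumption: sampling so top_p/temperature apply
            top_p=0.9,
            temperature=temperature,
            num_return_sequences=3,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,  # assumption: silences the pad warning
        )
    return [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]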