JaphetHernandez committed
Commit dbd2f4b · verified · 1 Parent(s): ff1d6d5

Update app.py

Files changed (1): app.py (+14 -3)
app.py CHANGED
@@ -1,4 +1,5 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch, dispatch_model, infer_auto_device_map
 import streamlit as st
 from huggingface_hub import login
 import pandas as pd
@@ -10,13 +11,23 @@ login(huggingface_token)
 # Load the tokenizer and the model
 model_id = "meta-llama/Llama-3.2-1B"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id)  # , device_map="auto")
 tokenizer.pad_token = tokenizer.eos_token
 
 MAX_INPUT_TOKEN_LENGTH = 10000
 
+# Load the model with disk offload
+with init_empty_weights():
+    model = AutoModelForCausalLM.from_config(model_id)
+
+device_map = infer_auto_device_map(model, max_memory={"disk": "2GiB"}, no_split_module_classes=["LlamaDecoderLayer"])
+model = load_checkpoint_and_dispatch(model, model_id, device_map=device_map, offload_folder="offload_dir")
+
+MAX_INPUT_TOKEN_LENGTH = 10000
+
+
 def generate_response(input_text, temperature=0.7, max_new_tokens=20):
-    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(model.device)
+    input_ids = tokenizer.encode(input_text, return_tensors='pt').to("cpu")  # Use 'cpu' to keep compatibility
 
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -32,7 +43,7 @@ def generate_response(input_text, temperature=0.7, max_new_tokens=20):
         top_p=0.9,
         temperature=temperature,
         num_return_sequences=3,
-        eos_token_id=tokenizer.eos_token_id  # Changed to an integer
+        eos_token_id=tokenizer.eos_token_id
     )
 
     try:
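
Two calls in the added offload block will likely fail as written: AutoModelForCausalLM.from_config() expects a config object rather than the repo id string, and load_checkpoint_and_dispatch() expects a local checkpoint path rather than a Hub id. A minimal corrected sketch of the same pattern follows; the 2GiB CPU memory budget is an illustrative assumption, and "offload_dir" is kept from the diff.

# Hedged sketch: a corrected version of the offload block above, assuming
# accelerate and huggingface_hub are installed.
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "meta-llama/Llama-3.2-1B"

# from_config() takes a config object, not the repo id string
config = AutoConfig.from_pretrained(model_id)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# load_checkpoint_and_dispatch() needs a local checkpoint path, so fetch the
# repo snapshot first instead of passing the Hub id directly
checkpoint_path = snapshot_download(model_id)

# Cap CPU memory (assumed budget) so layers that do not fit are assigned to "disk"
device_map = infer_auto_device_map(
    model,
    max_memory={"cpu": "2GiB"},
    no_split_module_classes=["LlamaDecoderLayer"],
)
model = load_checkpoint_and_dispatch(
    model,
    checkpoint_path,
    device_map=device_map,
    offload_folder="offload_dir",
)

Note also that the commit keeps the earlier full from_pretrained() load in place, so the model is materialized twice; in recent transformers the same disk offload is available in a single call via AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", offload_folder="offload_dir").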
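
Separately, the sampling kwargs visible in the last hunk (top_p, temperature, num_return_sequences) only take effect when sampling is enabled. A minimal sketch of a generate() call consistent with the diff; do_sample=True and pad_token_id are assumptions not present in the commit.

# Minimal sketch: temperature/top_p are ignored in greedy decoding, and
# num_return_sequences > 1 requires sampling or beam search, hence do_sample=True.
outputs = model.generate(
    input_ids,
    max_new_tokens=20,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=3,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,  # assumed; avoids a warning when padding
)
for seq in outputs:
    print(tokenizer.decode(seq, skip_special_tokens=True))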