ORLM

Running on Zero

tangzhy committed on Jul 25, 2024

Commit

5420f2a

verified ·

1 Parent(s): 5ab915e

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -25,23 +25,21 @@ MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # quantization_config = BitsAndBytesConfig(
 #     load_in_4bit=True,
 #     bnb_4bit_compute_dtype=torch.bfloat16,
 #     bnb_4bit_use_double_quant=True,
 #     bnb_4bit_quant_type= "nf4")
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
 tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    # torch_dtype=torch.bfloat16,
-    # attn_implementation="flash_attention_2",
-    quantization_config=quantization_config,
 )
 model.eval()
@@ -63,7 +61,7 @@ def generate(
     input_ids = tokenized_example.input_ids
     input_ids = input_ids.to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,

 DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 # quantization_config = BitsAndBytesConfig(
 #     load_in_4bit=True,
 #     bnb_4bit_compute_dtype=torch.bfloat16,
 #     bnb_4bit_use_double_quant=True,
 #     bnb_4bit_quant_type= "nf4")
+# quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
 tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    # quantization_config=quantization_config,
 )
 model.eval()
     input_ids = tokenized_example.input_ids
     input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=50.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,