sharpenb committed on
Commit 1f465d5 · verified · 1 Parent(s): 48cf990

da65b3b1c34a40604ca2ed176b1de57c9c38f9ae973abd92e092d98335f13f9f

Files changed (5)
  1. README.md +4 -4
  2. config.json +3 -3
  3. generation_config.json +1 -1
  4. model.safetensors +2 -2
  5. smash_config.json +10 -25
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 thumbnail: "https://assets-global.website-files.com/646b351987a8d8ce158d1940/64ec9e96b4334c0e1ac41504_Logo%20with%20white%20text.svg"
-base_model: HuggingFaceTB/SmolLM2-1.7B-Instruct
+base_model: ORIGINAL_REPO_NAME
 metrics:
 - memory_disk
 - memory_inference
@@ -52,7 +52,7 @@ tags:
 
 You can run the smashed model with these steps:
 
-0. Check requirements from the original repo HuggingFaceTB/SmolLM2-1.7B-Instruct installed. In particular, check python, cuda, and transformers versions.
+0. Check requirements from the original repo ORIGINAL_REPO_NAME installed. In particular, check python, cuda, and transformers versions.
 1. Make sure that you have installed quantization related packages.
 ```bash
 pip install transformers accelerate bitsandbytes>0.37.0
@@ -63,7 +63,7 @@ You can run the smashed model with these steps:
 
 
 model = AutoModelForCausalLM.from_pretrained("PrunaAI/HuggingFaceTB-SmolLM2-1.7B-Instruct-bnb-4bit-smashed", trust_remote_code=True, device_map='auto')
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
+tokenizer = AutoTokenizer.from_pretrained("ORIGINAL_REPO_NAME")
 
 input_ids = tokenizer("What is the color of prunes?,", return_tensors='pt').to(model.device)["input_ids"]
 
@@ -77,7 +77,7 @@ The configuration info are in `smash_config.json`.
 
 ## Credits & License
 
-The license of the smashed model follows the license of the original model. Please check the license of the original model HuggingFaceTB/SmolLM2-1.7B-Instruct before using this model which provided the base model. The license of the `pruna-engine` is [here](https://pypi.org/project/pruna-engine/) on Pypi.
+The license of the smashed model follows the license of the original model. Please check the license of the original model ORIGINAL_REPO_NAME before using this model which provided the base model. The license of the `pruna-engine` is [here](https://pypi.org/project/pruna-engine/) on Pypi.
 
 ## Want to compress other models?
 
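Note: the diff above templates out the base repo name, but the usage it documents is the standard transformers loading path for the smashed checkpoint. A minimal runnable sketch under the assumption that the quantization packages from step 1 are installed; the `max_new_tokens` value and the final `generate`/`decode` lines are added here for illustration and are not part of the diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the 4-bit smashed checkpoint; bitsandbytes handles dequantization at runtime.
model = AutoModelForCausalLM.from_pretrained(
    "PrunaAI/HuggingFaceTB-SmolLM2-1.7B-Instruct-bnb-4bit-smashed",
    trust_remote_code=True,
    device_map="auto",
)
# The tokenizer comes from the original (base) repository.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

input_ids = tokenizer("What is the color of prunes?,", return_tensors="pt").to(model.device)["input_ids"]
outputs = model.generate(input_ids, max_new_tokens=64)  # assumed generation length
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```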
 
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "/home/ubuntu/.cache/pruna/tmpbiu3po2h6i9trd70",
+  "_name_or_path": "/tmp/models/tmpwv8vhyngdrgkv1ch",
   "architectures": [
     "LlamaForCausalLM"
   ],
@@ -41,14 +41,14 @@
   "rope_scaling": null,
   "rope_theta": 130000,
   "tie_word_embeddings": true,
-  "torch_dtype": "float16",
+  "torch_dtype": "bfloat16",
   "transformers.js_config": {
     "kv_cache_dtype": {
       "fp16": "float16",
       "q4f16": "float16"
     }
   },
-  "transformers_version": "4.46.3",
+  "transformers_version": "4.48.2",
   "use_cache": true,
   "vocab_size": 49152
 }
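The two changed fields (`torch_dtype` and `transformers_version`) can be checked from the hub without downloading weights. A small sketch; `AutoConfig` reads the same `config.json` shown above:

```python
from transformers import AutoConfig

# Inspect the fields touched by this commit.
config = AutoConfig.from_pretrained(
    "PrunaAI/HuggingFaceTB-SmolLM2-1.7B-Instruct-bnb-4bit-smashed",
    trust_remote_code=True,
)
print(config.torch_dtype)           # expected: bfloat16 after this commit
print(config.transformers_version)  # expected: 4.48.2
```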
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 1,
   "eos_token_id": 2,
   "pad_token_id": 2,
-  "transformers_version": "4.46.3"
+  "transformers_version": "4.48.2"
 }
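Only the recorded `transformers_version` changes here; the special-token ids stay the same. A quick check, assuming the same repo name as above:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained(
    "PrunaAI/HuggingFaceTB-SmolLM2-1.7B-Instruct-bnb-4bit-smashed"
)
# Unchanged token ids from the diff above.
print(gen_config.bos_token_id, gen_config.eos_token_id, gen_config.pad_token_id)  # 1 2 2
```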
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:705a3816f413e14fb47db8da7423c590c1972c0f4cd29ba6c60d3f1deab19bf7
-size 1107606904
+oid sha256:62d057cb8b22ccaed925cc822b17c891138c20e8189cb27d2098ed2e46b00015
+size 1107607120
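`model.safetensors` is tracked with Git LFS, so the diff only touches the pointer's `oid` and `size`; the new binary lives in LFS storage. A hedged sketch for verifying a locally downloaded file against the pointer's sha256 (the local path is hypothetical):

```python
import hashlib

# Hypothetical local path to the downloaded weights file.
path = "model.safetensors"

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

# Should match the oid recorded in the LFS pointer after this commit.
print(sha256.hexdigest() == "62d057cb8b22ccaed925cc822b17c891138c20e8189cb27d2098ed2e46b00015")
```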
smash_config.json CHANGED
@@ -1,27 +1,11 @@
 {
-  "comp_cgenerate_active": false,
-  "comp_ctranslate_active": false,
-  "comp_cwhisper_active": false,
-  "comp_diffusers2_active": false,
-  "comp_flux_caching_active": false,
-  "comp_ifw_active": false,
-  "comp_ipex_llm_active": false,
-  "comp_onediff_active": false,
-  "comp_step_caching_active": false,
-  "comp_torch_compile_active": false,
-  "comp_ws2t_active": false,
-  "comp_x-fast_active": false,
-  "prune_torch-structured_active": false,
-  "prune_torch-unstructured_active": false,
-  "quant_aqlm_active": false,
-  "quant_awq_active": false,
-  "quant_gptq_active": false,
-  "quant_half_active": false,
-  "quant_hqq_active": false,
-  "quant_llm-int8_active": true,
-  "quant_quanto_active": false,
-  "quant_torch_dynamic_active": false,
-  "quant_torch_static_active": false,
+  "batchers": null,
+  "cachers": null,
+  "compilers": null,
+  "distillers": null,
+  "pruners": null,
+  "quantizers": "llm-int8",
+  "recoverers": null,
   "quant_llm-int8_compute_dtype": "bfloat16",
   "quant_llm-int8_double_quant": false,
   "quant_llm-int8_enable_fp32_cpu_offload": false,
@@ -31,8 +15,9 @@
   "quant_llm-int8_weight_bits": 4,
   "max_batch_size": 1,
   "device": "cuda",
-  "cache_dir": "/home/ubuntu/.cache/pruna/tmpbiu3po2h",
+  "cache_dir": "/tmp/models/tmpwv8vhyng",
   "task": "",
   "save_load_fn": "llm-int8",
-  "save_load_fn_args": {}
+  "save_load_fn_args": {},
+  "api_key": null
 }
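The rewritten `smash_config.json` replaces the per-algorithm boolean flags with one field per compression stage (`"quantizers": "llm-int8"` instead of `"quant_llm-int8_active": true`) while keeping the llm-int8 hyperparameters. As a rough illustration only, not Pruna's actual loading code, the quantizer settings above correspond to a bitsandbytes configuration along these lines:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Approximate bitsandbytes equivalent of the smash_config quantizer settings;
# the exact mapping pruna uses internally is an assumption here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                       # "quant_llm-int8_weight_bits": 4
    bnb_4bit_compute_dtype=torch.bfloat16,   # "quant_llm-int8_compute_dtype": "bfloat16"
    bnb_4bit_use_double_quant=False,         # "quant_llm-int8_double_quant": false
    llm_int8_enable_fp32_cpu_offload=False,  # "quant_llm-int8_enable_fp32_cpu_offload": false
)

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
)
```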