sd-inf committed on
Commit 8339e3f · verified · 1 Parent(s): 1ff9e78

Create README.md

Files changed (1):
  1. README.md +109 -0

README.md ADDED
@@ -0,0 +1,109 @@
---
language:
- en
---
# Kiwi 4B MoE Code Expert

This is the code expert from the Kiwi 1 3x4B MoE. Using this model for non-programming problems will produce suboptimal results.

# Kiwi 1 Mixture of Experts

Kiwi 1 MoE is composed of three experts: a math, a coding, and a general expert. Together they form a 3x4B configuration, yielding a 12B model when all experts are run together. The experts are uploaded separately so each can be used independently of the system.

Each expert is based on Qwen3 4B, fine-tuned for its task along with basic generalization for better problem-solving skills.
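
Since the experts are uploaded separately, you can also run a single one on its own. Below is a minimal sketch of standalone use of the code expert, assuming the `LucidityAI/Kiwi-4b-Coder` checkpoint and the ChatML-style prompt format used in the full pipeline further down:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Standalone use of a single expert; checkpoint name taken from the
# full MoE example below.
CKPT = "LucidityAI/Kiwi-4b-Coder"

tokenizer = AutoTokenizer.from_pretrained(CKPT, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    CKPT,
    torch_dtype=torch.float16,
    device_map="auto",  # place the model on GPU if one is available
    trust_remote_code=True,
)

# Same ChatML-style prompt format as the full pipeline below
prompt = (
    "<|im_start|>user\n"
    "Write a Python function that reverses a string.\n"
    "<|im_end|>\n"
    "<|im_start|>assistant\n"
    "<think>"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```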

## Running Kiwi 1 MoE

You can run the full Kiwi 1 MoE like so:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

# Paths to checkpoints
GATING_CKPT = r"LucidityAI/Kiwi-4b-General"
EXPERT_CKPTS = {
    1: r"LucidityAI/Kiwi-4b-General",  # General expert
    2: r"LucidityAI/Kiwi-4b-Math",     # Math expert
    3: r"LucidityAI/Kiwi-4b-Coder",    # Code expert
}

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

bnb_config = BitsAndBytesConfig(
    # Add the quantization options you need here,
    # e.g. load_in_8bit=True for 8-bit loading.
)

tokenizer = AutoTokenizer.from_pretrained(GATING_CKPT, trust_remote_code=True)

# Load an expert checkpoint, quantized according to bnb_config
def load_8bit_model(ckpt_path):
    return AutoModelForCausalLM.from_pretrained(
        ckpt_path,
        quantization_config=bnb_config,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map=DEVICE,  # keep the model and its inputs on the same device
    )

model_gate = load_8bit_model(GATING_CKPT)
models_expert = {
    1: model_gate,                        # Gating model / General expert
    2: load_8bit_model(EXPERT_CKPTS[2]),  # Math expert
    3: load_8bit_model(EXPERT_CKPTS[3]),  # Code expert
}

def route_input(text: str) -> int:
    """Ask the gating model which expert (1, 2 or 3) should handle the input."""
    prompt = (
        "<|im_start|>system\n"
        "You are a router. Based on the following input, respond with ONLY one digit, nothing else:\n"
        "1 for general questions,\n"
        "2 for math related questions,\n"
        "3 for coding related questions.\n"
        "Any task that involves programming at all should be allocated to the code expert, same for math. "
        "For questions that use both topics, select based on the topic that's most used.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n"
        f"{text} /no_think\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>\n</think>\n"
    )
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)
    output = model_gate.generate(input_ids=input_ids, max_new_tokens=15)
    response = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    # Keep only digits; fall back to the general expert on anything unexpected
    response = "".join(c for c in response if c.isdigit())
    try:
        choice = int(response[0])
    except (IndexError, ValueError):
        choice = 1
    if choice not in EXPERT_CKPTS:
        choice = 1
    return choice

def generate_response(text: str, max_new_tokens: int = 200) -> tuple[None, int]:
    """Route the input, then stream the chosen expert's answer to stdout."""
    choice = route_input(text)
    model = models_expert[choice]
    prompt = (
        "<|im_start|>user\n"
        f"{text}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>"
    )
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)

    print(f"[Expert {choice}] ", end="", flush=True)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
    )
    print("")
    return None, choice

if __name__ == "__main__":
    while True:
        user_input = input("User: ")
        _, expert_idx = generate_response(user_input)
```
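
Since the experts are based on Qwen3 4B, the hand-written `<|im_start|>` / `<|im_end|>` strings above can likely be replaced with the tokenizer's built-in chat template. Below is a minimal sketch, assuming the fine-tunes keep Qwen3's chat template (whose `enable_thinking` switch mirrors the `/no_think` tag used in the router prompt); the messages shown are illustrative only:

```python
# Sketch: build the router prompt via the chat template instead of
# concatenating <|im_start|> tags by hand. Assumes the experts keep
# Qwen3's chat template; the example messages are hypothetical.
messages = [
    {
        "role": "system",
        "content": "You are a router. Respond with ONLY one digit: "
                   "1 for general, 2 for math, 3 for coding questions.",
    },
    {"role": "user", "content": "Write a quicksort in Python."},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # Qwen3 template flag; equivalent to /no_think
)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)
```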