Kiwi 1 MoE
Kiwi 1 MoE is a 3x4B LLM with no dedicated router; instead, its general expert doubles as the router.
This is the code expert from the Kiwi 1 3x4B MoE. Using this model for non-programming problems will yield suboptimal results.
Kiwi 1 MoE is composed of three experts: a math expert, a coding expert, and a general expert. Together they form a 3x4B model, totalling 12B parameters when all experts are run together. The experts are uploaded separately so they can be used independently of the system.
Each expert is based on Qwen3 4B, fine-tuned for its task plus basic generalization for better problem-solving skills.
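Since each expert is a standalone Qwen3 4B checkpoint, it can also be loaded on its own with the standard transformers API. A minimal sketch, assuming the experts ship Qwen3's chat template (the sample prompt is purely illustrative):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load only the code expert; the math and general experts work the same way.
tokenizer = AutoTokenizer.from_pretrained("LucidityAI/Kiwi-4b-Coder", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "LucidityAI/Kiwi-4b-Coder",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

messages = [{"role": "user", "content": "Write a Python function that reverses a string."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=200)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))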
You can run Kiwi 1 MoE like so:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
# Paths to checkpoints
GATING_CKPT = "LucidityAI/Kiwi-4b-General"
EXPERT_CKPTS = {
    1: "LucidityAI/Kiwi-4b-General",  # General expert (also the gating model)
    2: "LucidityAI/Kiwi-4b-Math",     # Math expert
    3: "LucidityAI/Kiwi-4b-Coder",    # Code expert
}
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # 8-bit weights so all three experts fit in memory; add any other needed quantization options here.
)
tokenizer = AutoTokenizer.from_pretrained(GATING_CKPT, trust_remote_code=True)
# Load models in 8-bit
def load_8bit_model(ckpt_path):
    return AutoModelForCausalLM.from_pretrained(
        ckpt_path,
        quantization_config=bnb_config,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
    )
model_gate = load_8bit_model(GATING_CKPT)
models_expert = {
    1: model_gate,                        # Gating model / general expert (shared, loaded once)
    2: load_8bit_model(EXPERT_CKPTS[2]),  # Math expert
    3: load_8bit_model(EXPERT_CKPTS[3]),  # Code expert
}
def route_input(text: str) -> int:
    """Ask the gating model (the general expert) which expert should handle the input."""
    prompt = (
        "<|im_start|>system\n"
        "You are a router. Based on the following input, respond with ONLY one digit, nothing else:\n"
        "1 for general questions,\n"
        "2 for math related questions,\n"
        "3 for coding related questions.\n"
        "Any task that involves programming at all should be allocated to the code expert; the same goes for math. For questions that involve both topics, select based on the topic that is used most.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n"
        f"{text} /no_think\n"  # /no_think plus the empty <think> block below disables Qwen3's thinking mode
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>\n</think>\n"
    )
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)
    output = model_gate.generate(input_ids=input_ids, max_new_tokens=15)
    response = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    # Keep only the digits in the response and take the first one.
    response = "".join(c for c in response if c.isdigit())
    try:
        choice = int(response[0])
    except (IndexError, ValueError):
        choice = 1
    # Fall back to the general expert on anything unexpected.
    if choice not in EXPERT_CKPTS:
        choice = 1
    return choice
def generate_response(text: str, max_new_tokens: int = 200) -> tuple[None, int]:
    choice = route_input(text)
    model = models_expert[choice]
    prompt = (
        "<|im_start|>user\n"
        f"{text}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
        "<think>"
    )
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)
    print(f"[Expert {choice}] ", end="", flush=True)
    # Stream tokens to stdout as they are generated.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
    )
    print("")
    # The answer is streamed, so only the chosen expert index is returned.
    return None, choice
if __name__ == "__main__":
    while True:
        user_input = input("User: ")
        _, expert_idx = generate_response(user_input)
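With all three experts resident in 8-bit, the weights alone come to roughly 12 GB (about one byte per parameter across the 12B total), plus KV-cache and activation overhead, so plan GPU memory accordingly.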