import os
from dataclasses import dataclass, asdict

from ctransformers import AutoModelForCausalLM, AutoConfig


@dataclass
class GenerationConfig:
    """Decoding parameters forwarded verbatim (via asdict) to the ctransformers model call."""

    temperature: float
    top_k: int
    top_p: float
    repetition_penalty: float
    max_new_tokens: int
    reset: bool        # reset history (cache) before generating
    stream: bool       # yield tokens one at a time instead of returning the full string
    threads: int       # CPU threads used by the ggml backend
    stop: list[str]    # stop sequences that terminate generation


def format_prompt(user_prompt: str) -> str:
    """Wrap *user_prompt* in the Alpaca-style instruction template the model was tuned on.

    NOTE(review): the source was whitespace-mangled; the newlines inside the
    template are reconstructed to the conventional Instruction/Response layout —
    confirm against the model card.
    """
    return f"""### Instruction:
{user_prompt}

### Response:"""


def generate(llm: AutoModelForCausalLM, generation_config: GenerationConfig, prompt: str):
    """Run the model on the formatted prompt.

    With ``stream=True`` in *generation_config* the call returns a token
    generator; otherwise it returns the completed string.
    """
    return llm(format_prompt(prompt), **asdict(generation_config))


def generate_code(prompt: str, config_path: str, model_name: str,
                  max_tokens: int, temperature: float) -> str:
    """Load a quantized replit model from disk and stream a completion for *prompt*.

    Args:
        prompt: User instruction to complete.
        config_path: Directory holding the model config and the ``.bin`` weights.
        model_name: Weight file name without the ``.bin`` extension.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.

    Returns:
        The full generated text (also echoed to stdout as it streams).
    """
    path = os.path.abspath(f"{config_path}/{model_name}.bin")
    config = AutoConfig.from_pretrained(os.path.abspath(config_path))
    llm = AutoModelForCausalLM.from_pretrained(
        path,
        model_type="replit",
        config=config,
    )
    generation_config = GenerationConfig(
        temperature=temperature,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.0,
        max_new_tokens=max_tokens,       # adjust as needed
        reset=True,                      # reset history (cache)
        stream=True,                     # streaming per word/token
        threads=os.cpu_count() or 1,     # fix: cpu_count() can return None
        stop=["<|endoftext|>"],
    )
    generator = generate(llm, generation_config, prompt)
    output = ""
    for token in generator:
        # fix: print each streamed token without a trailing newline so the
        # generated code displays as written; flush so it appears immediately.
        print(token, end="", flush=True)
        output += token
    return output