import os
from dataclasses import dataclass, asdict

from ctransformers import AutoConfig, AutoModelForCausalLM


@dataclass
class GenerationConfig:
    temperature: float
    top_k: int
    top_p: float
    repetition_penalty: float
    max_new_tokens: int
    reset: bool
    stream: bool
    threads: int
    stop: list[str]


def format_prompt(user_prompt: str) -> str:
    """Wrap the user prompt in the Alpaca-style instruction template."""
    return f"""### Instruction:
{user_prompt}

### Response:"""


def generate(llm: AutoModelForCausalLM, generation_config: GenerationConfig, prompt: str):
    """Run the model on the formatted prompt, passing the config fields as keyword arguments."""
    return llm(format_prompt(prompt), **asdict(generation_config))


def generate_code(prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
    from_local = False
    model_path = model_name
    config_path = model_name
    if from_local:
        # Resolve paths for a checkpoint stored locally under models/<org>/<name>.bin.
        config_folder = model_name.split("/")[0]
        config_path = os.path.abspath(f"models/{config_folder}")
        model_path = os.path.abspath(f"models/{model_name}.bin")

    config = AutoConfig.from_pretrained(config_path)
    llm = AutoModelForCausalLM.from_pretrained(
        model_path,
        model_type="replit",
        config=config,
    )

    generation_config = GenerationConfig(
        temperature=temperature,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.0,
        max_new_tokens=max_tokens,  # adjust as needed
        reset=True,  # reset history (cache)
        stream=True,  # streaming per word/token
        threads=os.cpu_count(),  # adjust for your CPU
        stop=["<|endoftext|>"],
    )

    generator = generate(llm, generation_config, prompt)
    output = ""
    for word in generator:
        # Stream tokens to stdout as they arrive, without inserting extra newlines.
        print(word, end="", flush=True)
        output += word
    return output
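

# Minimal usage sketch, assuming a ctransformers-compatible Replit GGML
# checkpoint is available. The model id and prompt below are illustrative
# placeholders, not values from the original script; substitute the Hub repo
# or local path (with from_local=True) that you actually use.
if __name__ == "__main__":
    completion = generate_code(
        prompt="Write a Python function that checks whether a number is prime.",
        model_name="your-org/replit-code-ggml",  # hypothetical model id
        max_tokens=256,
        temperature=0.2,
    )
    # The tokens were already streamed to stdout; `completion` holds the full text.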