# Import required libraries
from unsloth import FastLanguageModel
import torch
from dotenv import load_dotenv
import os
import gc

# Load environment variables
load_dotenv()
token = os.getenv("HF_TOKEN")

# Parameter configuration
max_seq_length = 2048
dtype = None  # None for auto-detection; Float16 for Tesla T4/V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage
optimize_storage = True  # Shard the saved checkpoint to reduce pressure on disk
optimize_ram = True  # Release cached GPU memory before training

# Pre-quantized 4-bit models (faster downloads, no out-of-memory during quantization)
quantized_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
]

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-7b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=token,
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,   # 0 is the optimized setting in Unsloth
    bias="none",      # "none" is the optimized setting in Unsloth
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Memory and storage optimizations
if optimize_ram:
    # Release cached GPU memory and collect unreferenced Python objects
    torch.cuda.empty_cache()
    gc.collect()

if optimize_storage:
    # Save the adapter-augmented model in small shards to minimize disk usage
    model.save_pretrained("optimized_model", max_shard_size="100MB")
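# Optional sanity check (a minimal sketch added for illustration, not part of the
# original script): after attaching the LoRA adapters it is useful to confirm that
# only a small fraction of parameters is trainable and to see the quantized model's
# memory footprint. `print_trainable_parameters` comes from PEFT and
# `get_memory_footprint` from transformers; both are assumed to be reachable on the
# wrapped model.
model.print_trainable_parameters()
print(f"Model memory footprint: {model.get_memory_footprint() / 1024**3:.2f} GB")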
# Data preparation
from datasets import load_dataset

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must be appended, otherwise generation never stops

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = load_dataset("yahma/alpaca-cleaned", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)

# Model training
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=20,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=8e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

# Show current memory statistics
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Train the model
trainer_stats = trainer.train()

# Show final memory and time statistics
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Inference
FastLanguageModel.for_inference(model)  # Enable Unsloth's faster inference path
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonacci sequence.",  # instruction
            "1, 1, 2, 3, 5, 8",                  # input
            "",                                  # output - leave blank for generation
        )
    ],
    return_tensors="pt",
).to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
print(tokenizer.batch_decode(outputs))

# Streaming inference using TextStreamer
from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonacci sequence.",  # instruction
            "1, 1, 2, 3, 5, 8",                  # input
            "",                                  # output - leave blank for generation
        )
    ],
    return_tensors="pt",
).to("cuda")
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

# Save the fine-tuned LoRA adapters and tokenizer
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
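# Optional check (a minimal sketch added for illustration, not in the original script):
# before reloading the adapters it can be worth verifying that the formatted training
# examples actually fit inside max_seq_length, since longer examples are truncated by
# the trainer. The "text" field matches the column created by formatting_prompts_func.
sample_lengths = [
    len(tokenizer(example["text"])["input_ids"])
    for example in dataset.select(range(100))  # inspect the first 100 examples only
]
print(f"Longest of 100 sampled examples: {max(sample_lengths)} tokens "
      f"(max_seq_length = {max_seq_length})")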
Paris?", "", "", ) ], return_tensors="pt").to("cuda") outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True) print(tokenizer.batch_decode(outputs)) # Guardar en float16 para VLLM if True: model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit",) if True: model.push_to_hub_merged("Yjhhh/model", tokenizer, save_method="merged_16bit", token=token) # Guardar en formato GGUF if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_0") if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q4_0", token=token) if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_1") if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q4_1", token=token) if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q8") if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q8", token=token) if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q8_0") if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q8_0", token=token) if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q8_1") if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q8_1", token=token)