Parameter-efficient fine-tuning with Low-Rank Adaptation (LoRA) and Quantized LoRA (QLoRA).
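LoRA freezes the pretrained weight matrix W_0 and trains only a low-rank update, so for a linear layer the adapted forward pass (in the standard LoRA formulation) is

h = W_0 x + (α / r) · B A x,   with A ∈ ℝ^{r×d_in}, B ∈ ℝ^{d_out×r}, and r ≪ min(d_in, d_out).

A is initialized randomly and B starts at zero, so training begins from the pretrained model's behavior and only the small A and B matrices receive gradients. The layer sketched below implements exactly this.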
# Basic LoRA implementation concept
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

class LoRALinear(nn.Module):
    def __init__(self, in_features, out_features, rank=4, alpha=1):
        super().__init__()
        self.rank = rank
        self.alpha = alpha

        # Frozen pre-trained weights
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.weight.requires_grad = False

        # LoRA matrices: A projects down to the low rank, B projects back up
        self.lora_A = nn.Parameter(torch.empty(rank, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, rank))

        # Initialize A with Kaiming-uniform and B with zeros so the initial update is zero
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        # Original forward pass + scaled low-rank adaptation
        result = F.linear(x, self.weight)
        lora_result = F.linear(F.linear(x, self.lora_A), self.lora_B)
        return result + (self.alpha / self.rank) * lora_result
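A quick sanity check of the layer above (a minimal sketch; the hidden size, rank, and alpha are illustrative, not tied to any particular model):

# Illustrative usage of the LoRALinear layer defined above
layer = LoRALinear(in_features=768, out_features=768, rank=8, alpha=16)

x = torch.randn(2, 10, 768)   # (batch, seq_len, hidden)
out = layer(x)
print(out.shape)              # torch.Size([2, 10, 768])

# Only the LoRA matrices receive gradients; the base weight stays frozen
trainable = [name for name, p in layer.named_parameters() if p.requires_grad]
print(trainable)              # ['lora_A', 'lora_B']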
# QLoRA fine-tuning with LangTrain
import torch
from transformers import AutoTokenizer

from langtrain import QLoRATrainer
from langtrain.models import AutoModelForCausalLM
from langtrain.datasets import load_dataset

# Load model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

# Configure QLoRA parameters
qlora_config = {
    "r": 64,                 # Rank
    "lora_alpha": 16,        # Scaling parameter
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}

# Load and prepare dataset
dataset = load_dataset("your_dataset.jsonl")
dataset = dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding=True))

# Initialize trainer
trainer = QLoRATrainer(
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    qlora_config=qlora_config,
    output_dir="./qlora_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    save_steps=500,
    logging_steps=10,
)

# Start training
trainer.train()
# Advanced LoRA configuration
import torch

advanced_config = {
    # Core LoRA parameters
    "r": 32,                 # Rank - balance between efficiency and capacity
    "lora_alpha": 64,        # Scaling factor (typically 2 * r)
    "lora_dropout": 0.05,    # Regularization

    # Target modules - customize based on model architecture
    "target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",   # Attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP (for Llama-like models)
    ],

    # Advanced options
    "bias": "lora_only",                             # Train bias terms in LoRA layers only
    "modules_to_save": ["embed_tokens", "lm_head"],  # Additional modules trained in full
    "init_lora_weights": True,                       # Proper initialization

    # QLoRA specific
    "load_in_4bit": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
}

# Training hyperparameters
training_args = {
    "output_dir": "./advanced_lora_results",
    "num_train_epochs": 5,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "learning_rate": 1e-4,
    "weight_decay": 0.01,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine",
    "save_strategy": "steps",
    "save_steps": 250,
    "eval_strategy": "steps",
    "eval_steps": 250,
    "logging_steps": 10,
    "fp16": False,
    "bf16": True,                     # Better numerical stability
    "dataloader_pin_memory": False,   # Memory optimization
    "remove_unused_columns": False,
}
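To see why this stays parameter-efficient, here is a back-of-the-envelope count of the trainable parameters the configuration above adds. The dimensions are assumptions for an 8B Llama-style model (hidden size 4096, MLP intermediate size 14336, grouped-query K/V projections of width 1024, 32 transformer blocks), not values read from a checkpoint:

# Rough count of trainable LoRA parameters for the config above
# (dimensions are assumed, not loaded from the actual checkpoint)
hidden, intermediate, kv_dim = 4096, 14336, 1024
r = 32
num_layers = 32

module_shapes = {   # (d_in, d_out) per adapted projection
    "q_proj": (hidden, hidden),
    "k_proj": (hidden, kv_dim),
    "v_proj": (hidden, kv_dim),
    "o_proj": (hidden, hidden),
    "gate_proj": (hidden, intermediate),
    "up_proj": (hidden, intermediate),
    "down_proj": (intermediate, hidden),
}

# Each adapted projection adds A (r x d_in) and B (d_out x r) => r * (d_in + d_out) params
per_layer = sum(r * (d_in + d_out) for d_in, d_out in module_shapes.values())
total = per_layer * num_layers
print(f"Trainable LoRA parameters: ~{total / 1e6:.0f}M (vs ~8B frozen)")   # ~84M, roughly 1%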
# Method 1: Merge LoRA adapters into the base model
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load base model, tokenizer, and adapter
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = PeftModel.from_pretrained(base_model, "./lora_results")

# Merge adapters into the base weights
merged_model = model.merge_and_unload()

# Save merged model
merged_model.save_pretrained("./merged_model")
tokenizer.save_pretrained("./merged_model")

# Method 2: Deploy with separate adapters
from langtrain import LoRAInference

# Initialize inference engine
inference = LoRAInference(
    base_model="meta-llama/Llama-3.1-8B",
    adapter_path="./lora_results",
    device="cuda",
    torch_dtype=torch.float16,
)

# Switch between different adapters dynamically
inference.load_adapter("task_1", "./task1_lora")
inference.load_adapter("task_2", "./task2_lora")

# Generate with a specific adapter
response = inference.generate(
    "Hello, how are you?",
    adapter_name="task_1",
    max_length=100,
    temperature=0.7,
)

print(response)
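As a rule of thumb, merging (Method 1) fits best when a single fine-tuned model is served on its own: it produces a standalone checkpoint with no adapter overhead at inference time. Keeping adapters separate (Method 2) lets one base model serve several tasks, and each adapter file stays small compared to the full model weights.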
Enable gradient_checkpointing=True for roughly 40-50% memory reduction, use bf16 instead of fp16 for numerical stability, and raise the effective batch size with gradient accumulation for better GPU utilization.

# Production-optimized training configuration
from langtrain import OptimizedQLoRATrainer
import torch

# Memory-efficient configuration
optimizer_config = {
    # Optimizer settings
    "optimizer": "adamw_torch_fused",   # Faster fused optimizer
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "adam_epsilon": 1e-8,

    # Memory optimizations
    "gradient_checkpointing": True,
    "dataloader_pin_memory": False,
    "dataloader_num_workers": 4,
    "remove_unused_columns": False,

    # Performance optimizations
    "bf16": True,                        # More stable than fp16
    "tf32": True,                        # Enable TensorFloat-32 on Ampere+ GPUs (e.g. A100)
    "ddp_find_unused_parameters": False,

    # Batch size optimization
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,   # Effective batch size = 16 per device
    "max_grad_norm": 1.0,
}

# Initialize optimized trainer
trainer = OptimizedQLoRATrainer(
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    **optimizer_config,
)

# Train (mixed precision is handled by the trainer via the bf16 flag)
trainer.train()

# Profile memory usage
print(f"Peak memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
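Before turning on bf16 and tf32, it is worth confirming the hardware actually supports them. This small check uses plain PyTorch APIs and is not part of LangTrain:

# Verify precision support before enabling bf16 / tf32 (plain PyTorch, illustrative)
import torch

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"bf16 supported: {torch.cuda.is_bf16_supported()}")            # Ampere (A100, RTX 30xx) or newer
    print(f"tf32 matmul allowed: {torch.backends.cuda.matmul.allow_tf32}")
else:
    print("No CUDA device found; bf16/tf32 settings will be ignored.")

# Effective batch size = per-device batch x gradient accumulation steps x number of GPUs
effective_bs = 1 * 16 * max(torch.cuda.device_count(), 1)
print(f"Effective batch size: {effective_bs}")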