Complete guide to full parameter fine-tuning for maximum model customization and performance.
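The examples on this page call the fine-tuning API through a client object. A minimal setup sketch is shown below; the package name, class, and environment variable are placeholders for whatever your SDK actually exposes, not a documented interface.

# Hypothetical SDK setup -- the import path, Client class, and env var
# are assumptions; substitute your platform's real client initialization.
import os

from studio import Client  # assumed package name

client = Client(api_key=os.environ["STUDIO_API_KEY"])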
# Full fine-tuning configuration
config = {
    "method": "full",
    "model": "llama-2-7b",
    "learning_rate": 1e-5,
    "batch_size": 8,
    "gradient_accumulation_steps": 4,
    "epochs": 3,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "optimizer": "adamw",
    "scheduler": "cosine"
}
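With gradient accumulation, the optimizer sees an effective batch of batch_size × gradient_accumulation_steps examples per update. A quick sanity check of the schedule implied by the configuration above; the dataset size is a made-up figure for illustration.

# Effective batch size and update count implied by the config above.
num_examples = 50_000  # assumed dataset size, for illustration only

effective_batch = config["batch_size"] * config["gradient_accumulation_steps"]  # 8 * 4 = 32
updates_per_epoch = num_examples // effective_batch                             # 1562
total_updates = updates_per_epoch * config["epochs"]                            # 4686

# With this dataset size, warmup_steps=500 is roughly 10% of total updates.
print(effective_batch, updates_per_epoch, total_updates)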
# Example training data format
{
    "instruction": "Summarize the following text:",
    "input": "Large language models have shown remarkable capabilities...",
    "output": "LLMs demonstrate strong performance across many NLP tasks."
}

# Upload dataset
dataset = client.datasets.upload(
    file_path="full_training_data.jsonl",
    name="full-finetune-dataset",
    validation_split=0.1
)
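Given the .jsonl extension in the upload call, the training file presumably holds one JSON object per line in the format shown above. A minimal sketch that writes such a file and skips incomplete records before uploading; the field names mirror the example, and the validation rule is only a suggestion, not a platform requirement.

import json

records = [
    {
        "instruction": "Summarize the following text:",
        "input": "Large language models have shown remarkable capabilities...",
        "output": "LLMs demonstrate strong performance across many NLP tasks.",
    },
    # ... more records ...
]

with open("full_training_data.jsonl", "w") as f:
    for rec in records:
        # Skip records missing an instruction or an output.
        if not rec.get("instruction") or not rec.get("output"):
            continue
        f.write(json.dumps(rec) + "\n")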
# Start full fine-tuning job
job = client.fine_tune.create(
    model="mistral-7b",
    dataset=dataset.id,
    config={
        "method": "full",
        "learning_rate": 5e-6,
        "batch_size": 4,
        "epochs": 2,
        "gradient_checkpointing": True,
        "fp16": True,
        "deepspeed_stage": 2,
        "save_steps": 500,
        "logging_steps": 100,
        "evaluation_strategy": "steps",
        "eval_steps": 500
    }
)

print(f"Full fine-tuning job started: {job.id}")
# Distributed training configuration
distributed_config = {
    "method": "full",
    "distributed": {
        "strategy": "deepspeed",
        "stage": 3,  # ZeRO stage 3 for maximum memory efficiency
        "gradient_clipping": 1.0,
        "allgather_bucket_size": 2e8,
        "reduce_bucket_size": 2e8
    },
    "hardware": {
        "gpu_count": 8,
        "instance_type": "gpu-large",
        "gradient_accumulation_steps": 16
    }
}

# Launch distributed training
job = client.fine_tune.create(
    model="llama-2-13b",
    dataset=dataset.id,
    config=distributed_config
)
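Full fine-tuning keeps weights, gradients, and optimizer state for every parameter in GPU memory, which is what fp16, gradient checkpointing, and ZeRO partitioning are working against. A rough back-of-the-envelope estimate for the 13B run above, assuming mixed-precision AdamW; the byte counts are the usual accounting and ignore activations and framework overhead.

# Rough memory estimate for full fine-tuning a 13B model in mixed precision.
params = 13e9
bytes_per_param = (
    2      # fp16 weights
    + 2    # fp16 gradients
    + 4    # fp32 master weights
    + 8    # fp32 Adam moments (m and v)
)

total_gb = params * bytes_per_param / 1e9                             # ~208 GB of model state
per_gpu_gb = total_gb / distributed_config["hardware"]["gpu_count"]   # ~26 GB per GPU with ZeRO-3 partitioning

print(f"{total_gb:.0f} GB total, ~{per_gpu_gb:.0f} GB per GPU with stage-3 partitioning")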
import time

# Monitor training progress
while job.status in ["queued", "running"]:
    job = client.fine_tune.get(job.id)

    if job.metrics:
        print(f"Step: {job.metrics.step}")
        print(f"Training Loss: {job.metrics.train_loss:.4f}")
        print(f"Validation Loss: {job.metrics.eval_loss:.4f}")
        print(f"Learning Rate: {job.metrics.learning_rate:.2e}")

    time.sleep(60)

print(f"Training completed with status: {job.status}")
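If you want the full history rather than just the latest values, the same polling loop can append each snapshot to a CSV for later plotting. A sketch reusing the metric fields printed above; it assumes metrics may be absent early in the run.

import csv
import time

with open("training_metrics.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["step", "train_loss", "eval_loss", "learning_rate"])

    job = client.fine_tune.get(job.id)
    while job.status in ["queued", "running"]:
        if job.metrics:
            writer.writerow([
                job.metrics.step,
                job.metrics.train_loss,
                job.metrics.eval_loss,
                job.metrics.learning_rate,
            ])
        time.sleep(60)
        job = client.fine_tune.get(job.id)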
# Best practices configuration
best_practices_config = {
    "method": "full",
    "learning_rate": 2e-6,      # Conservative learning rate
    "weight_decay": 0.01,       # L2 regularization
    "dropout": 0.1,             # Dropout for regularization
    "gradient_clipping": 1.0,   # Prevent gradient explosion
    "early_stopping": {
        "patience": 3,
        "metric": "eval_loss",
        "min_delta": 0.001
    },
    "save_strategy": "epoch",
    "load_best_model_at_end": True
}
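The early_stopping block above stops training once eval_loss has failed to improve by at least min_delta for patience consecutive evaluations. In plain Python the rule looks roughly like this; it is a standalone illustration of the logic, not part of the SDK.

# Standalone illustration of the early-stopping rule configured above.
def should_stop(eval_losses, patience=3, min_delta=0.001):
    """Return True once eval loss has failed to improve by min_delta
    for `patience` consecutive evaluations."""
    best = float("inf")
    bad_rounds = 0
    for loss in eval_losses:
        if loss < best - min_delta:
            best = loss
            bad_rounds = 0
        else:
            bad_rounds += 1
            if bad_rounds >= patience:
                return True
    return False

# True: three evaluations in a row without a 0.001 improvement over 1.71.
print(should_stop([1.90, 1.72, 1.71, 1.7095, 1.7093, 1.7092]))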