Master Large Language Model training, covering pre-training, fine-tuning, and alignment techniques along with modern optimization strategies.
import langtrain
from langtrain.models import LlamaForCausalLM
from langtrain.data import PretrainingDataset
import torch

# Configure model architecture
config = langtrain.LlamaConfig(
    vocab_size=32000,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    max_position_embeddings=4096,
    rms_norm_eps=1e-6,
    rope_theta=10000.0,
    attention_dropout=0.0,
    hidden_dropout=0.0
)

# Initialize model with proper weight initialization
model = LlamaForCausalLM(config)
model.apply(lambda m: langtrain.init_weights(m, config))

# Prepare pre-training dataset
dataset = PretrainingDataset(
    data_path="path/to/tokenized_data",
    seq_length=4096,
    tokenizer=tokenizer,  # assumes a tokenizer matching vocab_size=32000 was loaded earlier
    pack_sequences=True,  # Pack multiple documents into each sequence
    shuffle_buffer_size=10000
)

# Configure training with modern optimizations
training_args = langtrain.TrainingArguments(
    output_dir="./llama-7b-pretrain",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,  # Effective batch size: 8 * 16 * num_gpus
    max_steps=100000,
    learning_rate=3e-4,
    weight_decay=0.1,
    warmup_steps=2000,
    lr_scheduler_type="cosine",
    bf16=True,  # Use bfloat16 for numerical stability
    dataloader_num_workers=4,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",  # Fused AdamW for efficiency
    logging_steps=10,
    save_steps=5000,
    max_grad_norm=1.0
)

# Initialize trainer with distributed support
trainer = langtrain.Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer
)

# Start pre-training
trainer.train()
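The arguments above pair a 2,000-step linear warmup with cosine decay over 100,000 steps. As a reference point, here is a minimal sketch of that schedule in plain PyTorch (not the langtrain API), expressed as a multiplicative LambdaLR factor on the base learning rate:

import math
import torch

# Sketch of the schedule implied by lr_scheduler_type="cosine",
# warmup_steps=2000 and max_steps=100000: linear warmup to the peak
# learning rate, then cosine decay toward zero.
def cosine_with_warmup(step, warmup_steps=2000, max_steps=100_000):
    if step < warmup_steps:
        return step / max(1, warmup_steps)                 # linear warmup factor in [0, 1]
    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    return 0.5 * (1.0 + math.cos(math.pi * progress))      # cosine decay factor in [0, 1]

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.1)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_with_warmup)
# scheduler.step() is called once per optimizer step; the returned factor scales the base lr of 3e-4.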
import deepspeed
from langtrain.distributed import setup_distributed_training

# DeepSpeed ZeRO configuration
ds_config = {
    "train_batch_size": 512,
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 32,

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 3e-4,
            "betas": [0.9, 0.95],
            "eps": 1e-8,
            "weight_decay": 0.1
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 3e-4,
            "warmup_num_steps": 2000,
            "total_num_steps": 100000
        }
    },

    "zero_optimization": {
        "stage": 2,  # ZeRO-2: shard gradients and optimizer states
        "offload_optimizer": {
            "device": "cpu",  # Offload optimizer states to CPU memory
            "pin_memory": True
        },
        "allgather_partitions": True,
        "reduce_scatter": True,
        "overlap_comm": True,
        "contiguous_gradients": True
    },

    "fp16": {
        "enabled": True,
        "auto_cast": True,
        "loss_scale": 0,  # 0 enables dynamic loss scaling
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "gradient_clipping": 1.0,
    "wall_clock_breakdown": False
}

# Initialize distributed training
setup_distributed_training()

# Initialize the DeepSpeed engine
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    config=ds_config,
    model_parameters=model.parameters()
)

# Training loop with DeepSpeed
for step, batch in enumerate(dataloader):
    outputs = model_engine(**batch)
    loss = outputs.loss
    model_engine.backward(loss)
    model_engine.step()

    if step % 100 == 0:
        print(f"Step {step}, Loss: {loss.item():.4f}")

    if step > 0 and step % 5000 == 0:
        model_engine.save_checkpoint("./checkpoints", step)

# Alternatively, train with the langtrain Trainer and real-time monitoring
for epoch in trainer.train_epochs():
    print(f"Epoch {epoch.number}: Loss={epoch.loss:.4f}")

    # Adjust parameters if needed
    if epoch.loss > 0.5:
        trainer.adjust_learning_rate(0.8)  # Reduce the learning rate by 20%
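One detail worth checking in ds_config: DeepSpeed requires train_batch_size to equal train_micro_batch_size_per_gpu multiplied by gradient_accumulation_steps and the number of GPUs. A small sanity-check sketch over the dictionary defined above:

# Sanity check: DeepSpeed enforces
#   train_batch_size == train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size
micro_batch  = ds_config["train_micro_batch_size_per_gpu"]   # 4
grad_accum   = ds_config["gradient_accumulation_steps"]      # 32
global_batch = ds_config["train_batch_size"]                 # 512

world_size = global_batch // (micro_batch * grad_accum)      # -> 4 GPUs for this config
assert global_batch == micro_batch * grad_accum * world_size, "inconsistent batch settings"
print(f"This configuration assumes {world_size} GPUs")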
# Advanced training with custom configuration
training_config = langtrain.TrainingConfig(
    # Optimization
    optimizer="adamw",
    learning_rate=2e-5,
    weight_decay=0.01,

    # Scheduling
    lr_scheduler="cosine",
    warmup_steps=1000,

    # Efficiency
    mixed_precision=True,
    gradient_checkpointing=True,
    dataloader_num_workers=8,

    # Monitoring
    eval_steps=500,
    save_steps=1000,
    logging_steps=100
)

trainer = langtrain.Trainer(
    model=model,
    config=training_config,
    callbacks=[
        langtrain.EarlyStoppingCallback(patience=3),
        langtrain.ModelCheckpointCallback(save_best=True),
        langtrain.WandbCallback(project="my-project")
    ]
)

results = trainer.train()
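The EarlyStoppingCallback(patience=3) above halts training once the monitored evaluation metric stops improving. A minimal sketch of that rule in plain Python (illustrative only; the actual langtrain callback interface may differ):

# Illustrative early-stopping rule (not the langtrain callback API):
# stop once eval loss has failed to improve for `patience` consecutive evaluations.
class EarlyStopper:
    def __init__(self, patience=3):
        self.patience = patience
        self.best_loss = float("inf")
        self.bad_evals = 0

    def should_stop(self, eval_loss):
        if eval_loss < self.best_loss:
            self.best_loss = eval_loss
            self.bad_evals = 0
        else:
            self.bad_evals += 1
        return self.bad_evals >= self.patience

stopper = EarlyStopper(patience=3)
for eval_loss in [2.1, 1.8, 1.9, 1.9, 1.9]:
    if stopper.should_stop(eval_loss):
        print("Early stopping triggered")
        break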
# Multi-GPU training
langtrain train \
    --config config.yaml \
    --distributed \
    --num-gpus 4 \
    --backend nccl

# Multi-node training
langtrain train \
    --config config.yaml \
    --distributed \
    --num-nodes 2 \
    --node-rank 0 \
    --master-addr "192.168.1.100" \
    --master-port 29500
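On each worker, flags like these typically map onto PyTorch's standard distributed environment variables. A rough sketch of the per-process setup, assuming the launcher (like torchrun) exports RANK, WORLD_SIZE, and LOCAL_RANK for every process:

import os
import torch
import torch.distributed as dist

# Per-process setup implied by the launcher flags above
# (assumes RANK, WORLD_SIZE, and LOCAL_RANK are set by the launcher).
os.environ.setdefault("MASTER_ADDR", "192.168.1.100")   # --master-addr
os.environ.setdefault("MASTER_PORT", "29500")           # --master-port

dist.init_process_group(backend="nccl")                 # --backend nccl
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)                       # one process per GPU
print(f"rank {dist.get_rank()} of {dist.get_world_size()} initialized")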