# LLM Training: From Pretraining to Fine-Tuning
## 1. Technical Analysis

### 1.1 LLM Training Pipeline

LLM training is divided into two major phases: pretraining and fine-tuning. The overall flow is:

- Pretraining: large-scale unsupervised training
- Fine-tuning: task-specific training
- RLHF: reinforcement learning from human feedback

### 1.2 Pretraining vs. Fine-Tuning

| Stage | Data | Objective | Method |
| --- | --- | --- | --- |
| Pretraining | Large-scale text | Language modeling | Self-supervised learning |
| Fine-tuning | Task data | Specific tasks | Supervised learning |
| RLHF | Human feedback | Alignment with human preferences | Reinforcement learning |

### 1.3 Training Strategies

- Pretraining: next-token prediction
- Fine-tuning: supervised fine-tuning (SFT)
- RLHF: reward model + PPO

## 2. Core Implementation

### 2.1 Pretraining Data Processing

```python
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    """Wraps a list of raw texts for causal language modeling."""

    def __init__(self, texts, tokenizer, max_len=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"].flatten()
        # For next-token prediction the labels are the input ids themselves;
        # the model shifts them internally when computing the loss.
        labels = input_ids.clone()
        return {"input_ids": input_ids, "labels": labels}


class DataCollator:
    """Stacks individual samples into a batch."""

    def __call__(self, batch):
        input_ids = torch.stack([item["input_ids"] for item in batch])
        labels = torch.stack([item["labels"] for item in batch])
        return {"input_ids": input_ids, "labels": labels}


class TextDataLoader:
    """Convenience wrapper combining dataset, collator, and DataLoader."""

    def __init__(self, texts, tokenizer, batch_size=32, max_len=512):
        self.dataset = TextDataset(texts, tokenizer, max_len)
        self.collator = DataCollator()
        self.dataloader = DataLoader(
            self.dataset,
            batch_size=batch_size,
            collate_fn=self.collator,
            shuffle=True,
        )

    def __iter__(self):
        return iter(self.dataloader)

    def __len__(self):
        return len(self.dataloader)
```

### 2.2 Pretraining Trainer

```python
class PretrainingTrainer:
    """Single-device training loop for causal language modeling."""

    def __init__(self, model, optimizer, scheduler, device="cuda"):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device

    def train_step(self, batch):
        self.model.train()
        self.optimizer.zero_grad()
        input_ids = batch["input_ids"].to(self.device)
        labels = batch["labels"].to(self.device)
        # The model is expected to return an object with a .loss attribute
        # (e.g. a Hugging Face causal LM output).
        outputs = self.model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        return loss.item()

    def train_epoch(self, dataloader):
        total_loss = 0.0
        for batch in dataloader:
            total_loss += self.train_step(batch)
        return total_loss / len(dataloader)


class DistributedPretrainer:
    """Data-parallel variant; assumes torch.distributed is already initialized."""

    def __init__(self, model, config):
        self.model = torch.nn.parallel.DistributedDataParallel(model)
        self.config = config

    def train(self, dataloader):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=self.config["epochs"]
        )
        for epoch in range(self.config["epochs"]):
            for batch in dataloader:
                optimizer.zero_grad()
                input_ids = batch["input_ids"].to("cuda")
                labels = batch["labels"].to("cuda")
                outputs = self.model(input_ids, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
            scheduler.step()
```
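For context, here is a minimal sketch of how these pieces could be wired together on a single device. The choice of Hugging Face's GPT2LMHeadModel / GPT2TokenizerFast as the causal LM, the toy corpus, and all hyperparameter values are illustrative assumptions rather than part of the pipeline above.

```python
# Minimal single-device wiring sketch (illustrative; GPT-2 and all
# hyperparameters are assumptions, not part of the article's pipeline).
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Toy corpus standing in for a real pretraining dataset.
texts = [
    "LLM pretraining predicts the next token.",
    "Fine-tuning adapts a pretrained model to a task.",
]

dataloader = TextDataLoader(texts, tokenizer, batch_size=2, max_len=128)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=3)

trainer = PretrainingTrainer(
    model, optimizer, scheduler,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
for epoch in range(3):
    avg_loss = trainer.train_epoch(dataloader)
    print(f"epoch {epoch}: loss {avg_loss:.4f}")
```

Any model that accepts a `labels=` argument and returns an output with a `.loss` attribute can stand in for GPT-2 here.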
### 2.3 Fine-Tuning and RLHF

```python
import torch.nn.functional as F


class SFTTrainer:
    """Supervised fine-tuning on instruction/response pairs."""

    def __init__(self, model, tokenizer, config):
        self.model = model
        self.tokenizer = tokenizer
        self.config = config

    def train(self, instruction_response_pairs):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])
        for epoch in range(self.config["epochs"]):
            for instruction, response in instruction_response_pairs:
                optimizer.zero_grad()
                prompt = f"Instruction: {instruction}\nResponse: {response}"
                encoding = self.tokenizer(prompt, return_tensors="pt")
                outputs = self.model(**encoding, labels=encoding["input_ids"])
                loss = outputs.loss
                loss.backward()
                optimizer.step()


class RewardModel(nn.Module):
    """Scores a sequence with a scalar reward from the last token's hidden state."""

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        self.reward_head = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids):
        outputs = self.base_model(input_ids)
        hidden_states = outputs.last_hidden_state[:, -1, :]
        reward = self.reward_head(hidden_states)
        return reward


class PPOTrainer:
    """Simplified policy-gradient step; full PPO additionally clips the
    probability ratio against a frozen reference policy."""

    def __init__(self, model, tokenizer, reward_model, config):
        self.model = model
        self.tokenizer = tokenizer
        self.reward_model = reward_model
        self.config = config
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])

    def train(self, prompt_response_pairs):
        for prompt, response in prompt_response_pairs:
            self.train_step(prompt, response)

    def train_step(self, prompt, response):
        self.optimizer.zero_grad()
        prompt_ids = self.tokenizer.encode(prompt, return_tensors="pt")
        response_ids = self.tokenizer.encode(response, return_tensors="pt")
        input_ids = torch.cat([prompt_ids, response_ids], dim=1)
        reward = self.reward_model(input_ids)
        log_probs = self._compute_log_probs(input_ids)
        loss = -(reward.squeeze(-1) * log_probs).mean()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _compute_log_probs(self, input_ids):
        outputs = self.model(input_ids)
        logits = outputs.logits
        log_probs = F.log_softmax(logits, dim=-1)
        # Gather the log-probability of each actual token in the sequence.
        log_probs = log_probs.gather(2, input_ids.unsqueeze(2)).squeeze(2)
        return log_probs.sum(dim=1)
```

## 3. Performance Comparison

### 3.1 Training Stages

| Stage | Data volume | Compute cost | Goal |
| --- | --- | --- | --- |
| Pretraining | Massive | Very high | General capability |
| SFT | Moderate | High | Task capability |
| RLHF | Small | Medium | Alignment |

### 3.2 Training Efficiency

| Strategy | Sample efficiency | Compute efficiency | Outcome |
| --- | --- | --- | --- |
| Pretraining | Low | High | General ability |
| SFT | High | Medium | Task ability |
| RLHF | Medium | Low | Alignment |

### 3.3 Impact of Model Size

Rough, hardware-dependent orders of magnitude:

| Parameter count | Pretraining time | Fine-tuning time | Inference speed |
| --- | --- | --- | --- |
| 1B | ~1 week | ~1 day | 1000 tokens/s |
| 10B | ~1 month | ~1 week | 500 tokens/s |
| 100B | ~6 months | ~1 month | 100 tokens/s |

## 4. Best Practices

### 4.1 Training Workflow

```python
def build_training_pipeline(config):
    # Returns the trainer class for the configured stage; the caller supplies
    # the model/tokenizer arguments when instantiating it.
    if config["stage"] == "pretrain":
        return PretrainingTrainer
    elif config["stage"] == "sft":
        return SFTTrainer
    elif config["stage"] == "rlhf":
        return PPOTrainer
    raise ValueError(f"unknown stage: {config['stage']}")


class LLMTrainingWorkflow:
    """Dispatches to the stage-specific training routine.
    The _load_* / _create_* / _train_reward_model helpers are placeholders
    to be filled in for a concrete setup."""

    def __init__(self, config):
        self.config = config

    def run(self):
        if self.config["stage"] == "pretrain":
            self._run_pretraining()
        elif self.config["stage"] == "sft":
            self._run_sft()
        elif self.config["stage"] == "rlhf":
            self._run_rlhf()

    def _run_pretraining(self):
        model = self._initialize_model()
        dataloader = self._create_dataloader()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.config["lr"])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=self.config["epochs"]
        )
        trainer = PretrainingTrainer(model, optimizer, scheduler)
        for epoch in range(self.config["epochs"]):
            trainer.train_epoch(dataloader)

    def _run_sft(self):
        model = self._load_pretrained_model()
        tokenizer = self._load_tokenizer()
        data = self._load_sft_data()
        trainer = SFTTrainer(model, tokenizer, self.config)
        trainer.train(data)

    def _run_rlhf(self):
        model = self._load_sft_model()
        tokenizer = self._load_tokenizer()
        reward_model = self._train_reward_model()
        data = self._load_rlhf_data()
        trainer = PPOTrainer(model, tokenizer, reward_model, self.config)
        trainer.train(data)
```

### 4.2 Training Optimizations

```python
class TrainingOptimizer:
    """Toggles common memory/throughput optimizations on a model."""

    def __init__(self, model):
        self.model = model
        self.scaler = None

    def enable_mixed_precision(self):
        # The scaler is used together with torch.cuda.amp.autocast in the train step.
        self.scaler = torch.cuda.amp.GradScaler()

    def enable_gradient_checkpointing(self):
        # Recomputes activations in the backward pass to reduce memory
        # (method provided by Hugging Face transformers models).
        self.model.gradient_checkpointing_enable()

    def enable_distributed_training(self):
        self.model = torch.nn.parallel.DistributedDataParallel(self.model)

    def apply_all(self):
        self.enable_mixed_precision()
        self.enable_gradient_checkpointing()
```

## 5. Summary

LLM training is a complex engineering effort:

- Pretraining: large-scale unsupervised learning for general capability
- Fine-tuning: supervised learning to adapt the model to specific tasks
- RLHF: reinforcement learning to align the model with human preferences
- Training optimization: mixed precision and distributed training

Key takeaways from the comparisons above:

- RLHF markedly improves model alignment
- Mixed-precision training can save roughly 50% of memory
- Gradient checkpointing can save roughly 30-50% of activation memory
- LoRA is recommended for efficient fine-tuning (see the sketch below)
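The summary recommends LoRA but the article does not show it, so here is a minimal, self-contained sketch of the idea in plain PyTorch. The LoRALinear class, the rank/alpha defaults, and the 768-dimensional layer are illustrative assumptions; in practice a library such as Hugging Face PEFT would typically be used instead of a hand-rolled layer.

```python
# Minimal LoRA sketch in plain PyTorch (illustrative; names and defaults are
# assumptions, not from the article). The frozen weight W is augmented with a
# trainable low-rank update B @ A scaled by alpha / rank.
import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    def __init__(self, base_linear: nn.Linear, rank: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base_linear
        self.base.weight.requires_grad_(False)  # freeze the pretrained weight
        if self.base.bias is not None:
            self.base.bias.requires_grad_(False)
        # Low-rank factors: A maps in_features -> rank, B maps rank -> out_features.
        self.lora_a = nn.Parameter(torch.randn(rank, base_linear.in_features) * 0.01)
        self.lora_b = nn.Parameter(torch.zeros(base_linear.out_features, rank))
        self.scaling = alpha / rank

    def forward(self, x):
        # Frozen path plus trainable low-rank path.
        return self.base(x) + (x @ self.lora_a.T @ self.lora_b.T) * self.scaling


# Usage: wrap a layer, then optimize only the LoRA parameters.
layer = LoRALinear(nn.Linear(768, 768), rank=8, alpha=16)
trainable = [p for p in layer.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable, lr=1e-4)
```

Only `lora_a` and `lora_b` receive gradients, which is where the memory and compute savings during fine-tuning come from.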