## 问题根源

权重文件是用 `DistributedDataParallel` 包装的模型保存的，键名带 `module.` 前缀，但加载时模型没有被 DDP 包装，导致键名不匹配。

## 解决方案

1. **保持 torchpack 的分布式初始化方式**：使用 `dist.init()`，而不是 PyTorch 原生的 `init_process_group`。
2. **正确处理权重文件**：在加载权重时去掉 `module.` 前缀。

## 关键代码修改

在 `model_zoo.py` 中添加了权重处理逻辑：

```python
# 去掉 module. 前缀
new_state_dict = {}
for k, v in state_dict.items():
    if k.startswith('module.'):
        new_k = k[7:]  # 去掉 'module.'
    else:
        new_k = k
    new_state_dict[new_k] = v

# 非严格模式加载
model.load_state_dict(new_state_dict, strict=False)
```

解决方法如下。

## 解决方法

### 第一步：修改 `test.py`

```python
import argparse
import sys, os

import torch
import torch.backends.cudnn
import torch.cuda
import torch.nn
import torch.utils.data
from torchpack import distributed as dist
from torchpack.callbacks import Callbacks, SaverRestore
from torchpack.environ import auto_set_run_dir, set_run_dir
from torchpack.utils.config import configs
from torchpack.utils.logging import logger
from tqdm import tqdm

from core import builder
from core.callbacks import MeanIoU
from core.trainers import SemanticKITTITrainer
from model_zoo import minkunet_test, spvcnn_test


def main() -> None:
    # 正确的分布式初始化
    dist.init()

    torch.backends.cudnn.benchmark = True
    torch.cuda.set_device(dist.local_rank())

    parser = argparse.ArgumentParser()
    parser.add_argument('--run-dir', metavar='DIR', help='run directory')
    parser.add_argument('--name', type=str, help='model name')
    parser.add_argument('--gpu', default='0', help='gpu index')
    args, opts = parser.parse_known_args()

    args.config = args.name + '/metainfo/configs.yaml'
    configs.load(args.config, recursive=True)
    configs.update(opts)

    if args.run_dir is None:
        args.run_dir = auto_set_run_dir()
    else:
        set_run_dir(args.run_dir)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    logger.info(' '.join([sys.executable] + sys.argv))
    logger.info(f'Experiment started: "{args.run_dir}".' + '\n' + f'{configs}')

    dataset = builder.make_dataset()
    dataflow = {}
    for split in dataset:
        sampler = torch.utils.data.distributed.DistributedSampler(
            dataset[split],
            num_replicas=dist.size(),
            rank=dist.rank(),
            shuffle=(split == 'train'))
        dataflow[split] = torch.utils.data.DataLoader(
            dataset[split],
            batch_size=1,
            sampler=sampler,
            num_workers=configs.workers_per_gpu,
            pin_memory=True,
            collate_fn=dataset[split].collate_fn)

    assert os.path.exists(args.name + '/checkpoints/max-iou-test.pt')

    # 加载模型，使用修复后的函数
    if 'spvcnn' in args.name.lower():
        model = spvcnn_test(
            weight_path=args.name + '/checkpoints/max-iou-test.pt',
            configs=configs)
    elif 'mink' in args.name.lower():
        model = minkunet_test(
            weight_path=args.name + '/checkpoints/max-iou-test.pt',
            configs=configs)
    else:
        raise NotImplementedError

    # 使用 DDP 包装模型
    model = torch.nn.parallel.DistributedDataParallel(
        model.cuda(),
        device_ids=[dist.local_rank()],
        find_unused_parameters=True)
    model.eval()

    criterion = builder.make_criterion()
    optimizer = builder.make_optimizer(model)
    scheduler = builder.make_scheduler(optimizer)

    trainer = SemanticKITTITrainer(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        num_workers=configs.workers_per_gpu,
        seed=configs.train.seed)
    callbacks = Callbacks([
        SaverRestore(),
        MeanIoU(configs.data.num_classes, configs.data.ignore_label)
    ])
    callbacks._set_trainer(trainer)
    trainer.callbacks = callbacks
    trainer.dataflow = dataflow['test']

    trainer.before_train()
    trainer.before_epoch()

    model.eval()

    for feed_dict in tqdm(dataflow['test'], desc='eval'):
        _inputs = {}
        for key, value in feed_dict.items():
            if 'name' not in key:
                _inputs[key] = value.cuda()

        inputs = _inputs['lidar']
        outputs = model(inputs)

        invs = feed_dict['inverse_map']
        all_labels = feed_dict['targets_mapped']
        _outputs = []
        _targets = []
        for idx in range(invs.C[:, -1].max() + 1):
            cur_scene_pts = (inputs.C[:, -1] == idx).cpu().numpy()
            cur_inv = invs.F[invs.C[:, -1] == idx].cpu().numpy()
            cur_label = (all_labels.C[:, -1] == idx).cpu().numpy()
            outputs_mapped = outputs[cur_scene_pts][cur_inv].argmax(1)
            targets_mapped = all_labels.F[cur_label]
            _outputs.append(outputs_mapped)
            _targets.append(targets_mapped)
        outputs = torch.cat(_outputs, 0)
        targets = torch.cat(_targets, 0)
        output_dict = {'outputs': outputs, 'targets': targets}
        trainer.after_step(output_dict)

    trainer.after_epoch()


if __name__ == '__main__':
    main()
```

### 第二步：修改 `model_zoo.py`

```python
def spvcnn_test(weight_path, configs):
    model = SPVCNN(
        num_classes=configs.data.num_classes,
        cr=configs.model.cr,
        pres=configs.dataset.voxel_size,
        vres=configs.dataset.voxel_size)

    checkpoint = torch.load(weight_path, map_location='cpu')

    # 精确提取模型权重
    if 'model' in checkpoint:
        state_dict = checkpoint['model']
    else:
        state_dict = checkpoint

    # 只加载模型相关的键
    model_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('module.'):
            k = k[7:]  # 去掉 module. 前缀
        # 只保留模型权重，过滤掉训练状态
        if not any(x in k for x in ['optimizer', 'scheduler', 'scaler', 'epoch', 'step']):
            model_state_dict[k] = v

    model.load_state_dict(model_state_dict, strict=False)
    return model
```