Skip to content

[BUG]: can not find parameter {full_param_name} in the GeminiDDP module #2430

@FrankieDong

Description

@FrankieDong

🐛 Describe the bug

When I run the training code, the following error occurs (the traceback output of two processes appears to be interleaved):

Traceback (most recent call last):
File "/home/zrytest/colorSD2.0/ColossalAI-main/ColossalAI/examples/images/diffusion/main.py", line 804, in <module>
trainer.fit(model, data)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 602, in fit
call._call_and_handle_interrupt(
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 644, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1097, in _run
results = self._run_stage()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1176, in _run_stage
self._run_train()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1199, in _run_train
Traceback (most recent call last):
File "/home/zrytest/colorSD2.0/ColossalAI-main/ColossalAI/examples/images/diffusion/main.py", line 804, in <module>
self.fit_loop.run()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/loop.py", line 200, in run
self.on_advance_end()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 295, in on_advance_end
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1379, in _call_callback_hooks
trainer.fit(model, data)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 602, in fit
call._call_and_handle_interrupt(
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 644, in _fit_impl
fn(self, self.lightning_module, *args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 304, in on_train_epoch_end
self._save_last_checkpoint(trainer, monitor_candidates)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 649, in _save_last_checkpoint
self._run(model, ckpt_path=self.ckpt_path)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1097, in _run
self._save_checkpoint(trainer, filepath)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
results = self._run_stage()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1176, in _run_stage
trainer.save_checkpoint(filepath, self.save_weights_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1924, in save_checkpoint
self._run_train()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1199, in _run_train
self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 510, in save_checkpoint
self.fit_loop.run()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/loop.py", line 200, in run
_checkpoint = self.dump_checkpoint(weights_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 442, in dump_checkpoint
self.on_advance_end()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 295, in on_advance_end
"state_dict": self._get_lightning_module_state_dict(),
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 524, in _get_lightning_module_state_dict
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1379, in _call_callback_hooks
state_dict = self.trainer.strategy.lightning_module_state_dict()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/strategies/colossalai.py", line 428, in lightning_module_state_dict
org_dict = self.model.state_dict(only_rank_0=rank_zero_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/colossalai-0.2.0-py3.8.egg/colossalai/nn/parallel/data_parallel.py", line 352, in state_dict
fn(self, self.lightning_module, *args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 304, in on_train_epoch_end
torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_rank_0)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/colossalai-0.2.0-py3.8.egg/colossalai/nn/parallel/utils.py", line 101, in get_static_torch_model
self._save_last_checkpoint(trainer, monitor_candidates)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 649, in _save_last_checkpoint
assert param in colo_to_torch, f"can not find parameter {full_param_name} in the GeminiDDP module"
AssertionError: can not find parameter _forward_module.first_stage_model.encoder.conv_in.weight in the GeminiDDP module

Environment

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions