🐛 Describe the bug
When I run the training code, the following error occurs (the output appears to interleave the tracebacks of two processes):
Traceback (most recent call last):
File "/home/zrytest/colorSD2.0/ColossalAI-main/ColossalAI/examples/images/diffusion/main.py", line 804, in
trainer.fit(model, data)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 602, in fit
call._call_and_handle_interrupt(
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 644, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1097, in _run
results = self._run_stage()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1176, in _run_stage
self._run_train()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1199, in _run_train
Traceback (most recent call last):
File "/home/zrytest/colorSD2.0/ColossalAI-main/ColossalAI/examples/images/diffusion/main.py", line 804, in
self.fit_loop.run()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/loop.py", line 200, in run
self.on_advance_end()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 295, in on_advance_end
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1379, in _call_callback_hooks
trainer.fit(model, data)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 602, in fit
call._call_and_handle_interrupt(
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 644, in _fit_impl
fn(self, self.lightning_module, *args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 304, in on_train_epoch_end
self._save_last_checkpoint(trainer, monitor_candidates)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 649, in _save_last_checkpoint
self._run(model, ckpt_path=self.ckpt_path)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1097, in _run
self._save_checkpoint(trainer, filepath)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
results = self._run_stage()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1176, in _run_stage
trainer.save_checkpoint(filepath, self.save_weights_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1924, in save_checkpoint
self._run_train()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1199, in _run_train
self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 510, in save_checkpoint
self.fit_loop.run()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/loop.py", line 200, in run
_checkpoint = self.dump_checkpoint(weights_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 442, in dump_checkpoint
self.on_advance_end()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 295, in on_advance_end
"state_dict": self._get_lightning_module_state_dict(),
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 524, in _get_lightning_module_state_dict
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1379, in _call_callback_hooks
state_dict = self.trainer.strategy.lightning_module_state_dict()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/strategies/colossalai.py", line 428, in lightning_module_state_dict
org_dict = self.model.state_dict(only_rank_0=rank_zero_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/colossalai-0.2.0-py3.8.egg/colossalai/nn/parallel/data_parallel.py", line 352, in state_dict
fn(self, self.lightning_module, *args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 304, in on_train_epoch_end
torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_rank_0)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/colossalai-0.2.0-py3.8.egg/colossalai/nn/parallel/utils.py", line 101, in get_static_torch_model
self._save_last_checkpoint(trainer, monitor_candidates)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 649, in _save_last_checkpoint
assert param in colo_to_torch, f"can not find parameter {full_param_name} in the GeminiDDP module"
AssertionError: can not find parameter _forward_module.first_stage_model.encoder.conv_in.weight in the GeminiDDP module
Environment
No response
🐛 Describe the bug
When I run the training code, the following error occurs (the output appears to interleave the tracebacks of two processes):
Traceback (most recent call last):
File "/home/zrytest/colorSD2.0/ColossalAI-main/ColossalAI/examples/images/diffusion/main.py", line 804, in
trainer.fit(model, data)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 602, in fit
call._call_and_handle_interrupt(
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 644, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1097, in _run
results = self._run_stage()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1176, in _run_stage
self._run_train()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1199, in _run_train
Traceback (most recent call last):
File "/home/zrytest/colorSD2.0/ColossalAI-main/ColossalAI/examples/images/diffusion/main.py", line 804, in
self.fit_loop.run()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/loop.py", line 200, in run
self.on_advance_end()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 295, in on_advance_end
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1379, in _call_callback_hooks
trainer.fit(model, data)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 602, in fit
call._call_and_handle_interrupt(
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 644, in _fit_impl
fn(self, self.lightning_module, *args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 304, in on_train_epoch_end
self._save_last_checkpoint(trainer, monitor_candidates)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 649, in _save_last_checkpoint
self._run(model, ckpt_path=self.ckpt_path)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1097, in _run
self._save_checkpoint(trainer, filepath)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 365, in _save_checkpoint
results = self._run_stage()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1176, in _run_stage
trainer.save_checkpoint(filepath, self.save_weights_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1924, in save_checkpoint
self._run_train()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1199, in _run_train
self._checkpoint_connector.save_checkpoint(filepath, weights_only=weights_only, storage_options=storage_options)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 510, in save_checkpoint
self.fit_loop.run()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/loop.py", line 200, in run
_checkpoint = self.dump_checkpoint(weights_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 442, in dump_checkpoint
self.on_advance_end()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 295, in on_advance_end
"state_dict": self._get_lightning_module_state_dict(),
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/checkpoint_connector.py", line 524, in _get_lightning_module_state_dict
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1379, in _call_callback_hooks
state_dict = self.trainer.strategy.lightning_module_state_dict()
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/strategies/colossalai.py", line 428, in lightning_module_state_dict
org_dict = self.model.state_dict(only_rank_0=rank_zero_only)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/colossalai-0.2.0-py3.8.egg/colossalai/nn/parallel/data_parallel.py", line 352, in state_dict
fn(self, self.lightning_module, *args, **kwargs)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 304, in on_train_epoch_end
torch_model = get_static_torch_model(zero_ddp_model=self, only_rank_0=only_rank_0)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/colossalai-0.2.0-py3.8.egg/colossalai/nn/parallel/utils.py", line 101, in get_static_torch_model
self._save_last_checkpoint(trainer, monitor_candidates)
File "/home/zrytest/anaconda3/envs/ldm_dx/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py", line 649, in _save_last_checkpoint
assert param in colo_to_torch, f"can not find parameter {full_param_name} in the GeminiDDP module"
AssertionError: can not find parameter _forward_module.first_stage_model.encoder.conv_in.weight in the GeminiDDP module
Environment
No response