Skip to content

parallel error #48

@mahao18cm

Description

@mahao18cm

Traceback (most recent call last):
File "/root/FlowFormer-Official/train_FlowFormer.py", line 169, in <module>
train(cfg)
File "/root/FlowFormer-Official/train_FlowFormer.py", line 89, in train
flow_predictions = model(image1, image2, output)
File "/root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 186, in forward
outputs = self.parallel_apply(replicas, inputs, module_kwargs)
File "/root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 201, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 108, in parallel_apply
output.reraise()
File "/root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/_utils.py", line 705, in reraise
raise exception
TypeError: Caught TypeError in replica 1 on device 1.
I am not sure whether this is caused by my cuDNN installation, because cuDNN reports an error too:
/root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/nn/modules/conv.py:456: UserWarning: Plan failed with a CuDNNError: cuDNN error: CUDNN_STATUS_BAD_PARAM
Exception raised from run_conv_plan at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:374 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f626257a897 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xe1c861 (0x7f62160ed861 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0x1095d83 (0x7f6216366d83 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0x1097c2c (0x7f6216368c2c in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x109817b (0x7f621636917b in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0x107aca2 (0x7f621634bca2 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: at::native::cudnn_convolution(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool, bool) + 0x53f (0x7f621634c66f in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0x32d0a9e (0x7f62185a1a9e in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #8: <unknown function> + 0x32e8251 (0x7f62185b9251 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #9: at::_ops::cudnn_convolution::call(at::Tensor const&, at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::SymInt, bool, bool, bool) + 0x2bb (0x7f624bbb8c2b in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #10: at::native::_convolution(at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, c10::ArrayRef<long>, long, bool, bool, bool, bool) + 0x13cb (0x7f624adf380b in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0x2e0089f (0x7f624bf8189f in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x2e071fc (0x7f624bf881fc in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #13: at::_ops::_convolution::call(at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, bool, c10::ArrayRef<c10::SymInt>, c10::SymInt, bool, bool, bool, bool) + 0x344 (0x7f624b6ca6f4 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #14: at::native::convolution(at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, c10::ArrayRef<long>, long) + 0x3b8 (0x7f624ade6e88 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0x2e0013c (0x7f624bf8113c in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0x2e07068 (0x7f624bf88068 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #17: at::_ops::convolution::redispatch(c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, bool, c10::ArrayRef<c10::SymInt>, c10::SymInt) + 0x17b (0x7f624b68838b in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #18: <unknown function> + 0x4503901 (0x7f624d684901 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #19: <unknown function> + 0x4504879 (0x7f624d685879 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #20: at::_ops::convolution::call(at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, bool, c10::ArrayRef<c10::SymInt>, c10::SymInt) + 0x2d4 (0x7f624b6c94f4 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #21: <unknown function> + 0x19bd900 (0x7f624ab3e900 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #22: at::native::conv2d_symint(at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::SymInt) + 0x16b (0x7f624adea76b in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #23: <unknown function> + 0x2ff96c3 (0x7f624c17a6c3 in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #24: <unknown function> + 0x2ff995d (0x7f624c17a95d in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #25: at::_ops::conv2d::call(at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::SymInt) + 0x26e (0x7f624bced95e in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #26: <unknown function> + 0x6853ad (0x7f62611803ad in /root/miniconda3/envs/flownet2/lib/python3.10/site-packages/torch/lib/libtorch_python.so)
frame #27: python() [0x4fdc87]

frame #30: python() [0x5099ce]
frame #32: python() [0x509b26]
frame #34: python() [0x509b26]
frame #38: python() [0x5cf883]
frame #41: python() [0x509b26]
frame #43: python() [0x509b26]
frame #47: python() [0x5cf883]
frame #50: python() [0x509b26]
frame #52: python() [0x509b26]
frame #56: python() [0x5cf883]
frame #59: python() [0x509b26]
frame #61: python() [0x509b26]
(Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:921.)
return F.conv2d(input, weight, bias, self.stride,

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions