🐛 Describe the bug
I was unable to build the docker image when running docker build -t colossalai ./docker with the following errors:
[+] Building 935.4s (6/9)
=> [internal] load build definition from Dockerfile 0.0s
=> => transferring dockerfile: 1.14kB 0.0s
=> [internal] load .dockerignore 0.0s
=> => transferring context: 2B 0.0s
=> [internal] load metadata for docker.io/hpcaitech/cuda-conda:11.3 0.5s
=> [1/6] FROM docker.io/hpcaitech/cuda-conda:11.3@sha256:8354717606e7be53824ff663ab3d4d0f99473f92896de00131d1e6a9a3bbd21d 0.0s
=> CACHED [2/6] RUN conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch 0.0s
=> ERROR [3/6] RUN git clone https://github.com/NVIDIA/apex && cd apex && pip install packaging && pip install -v --disable-pip-version-check --no-cache-dir --glo 934.8s
------
#0 931.0 /opt/conda/lib/python3.9/site-packages/torch/include/torch/csrc/jit/ir/attributes.h:35:27: note: ‘torch::jit::toString’
#0 931.0 35 | static inline const char* toString(AttributeKind kind) {
#0 931.0 | ^~~~~~~~
#0 934.5 error: command '/usr/bin/gcc' failed with exit code 1
#0 934.7 Running setup.py install for apex: finished with status 'error'
#0 934.7 ERROR: Command errored out with exit status 1: /opt/conda/bin/python -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-req-build-hi5ozq1r/setup.py'"'"'; __file__='"'"'/tmp/pip-req-build-hi5ozq1r/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' --cpp_ext --cuda_ext --fast_layer_norm install --record /tmp/pip-record-5zinxil4/install-record.txt --single-version-externally-managed --compile --install-headers /opt/conda/include/python3.9/apex Check the logs for full command output.
------
Dockerfile:12
--------------------
11 | # install apex
12 | >>> RUN git clone https://github.com/NVIDIA/apex && \
13 | >>> cd apex && \
14 | >>> pip install packaging && \
15 | >>> pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./
16 |
--------------------
ERROR: failed to solve: process "/bin/sh -c git clone https://github.com/NVIDIA/apex && cd apex && pip install packaging && pip install -v --disable-pip-version-check --no-cache-dir --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" --global-option=\"--fast_layer_norm\" ./" did not complete successfully: exit code: 1
This issue can be resolved by pinning the apex commit number.
After fixing this issue, I was still unable to import colossalai with error:
In [1]: import colossalai
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-1-4afa607b1f63> in <module>
----> 1 import colossalai
~/anaconda3/lib/python3.9/site-packages/colossalai/__init__.py in <module>
----> 1 from .initialize import (
2 get_default_parser,
3 initialize,
4 launch,
5 launch_from_openmpi,
~/anaconda3/lib/python3.9/site-packages/colossalai/initialize.py in <module>
16 from torch.utils.data import DataLoader
17
---> 18 from colossalai.amp import AMP_TYPE, convert_to_amp
19 from colossalai.amp.naive_amp import NaiveAMPModel
20 from colossalai.builder.builder import build_gradient_handler
~/anaconda3/lib/python3.9/site-packages/colossalai/amp/__init__.py in <module>
9
10 from .amp_type import AMP_TYPE
---> 11 from .apex_amp import convert_to_apex_amp
12 from .naive_amp import convert_to_naive_amp
13 from .torch_amp import convert_to_torch_amp
~/anaconda3/lib/python3.9/site-packages/colossalai/amp/apex_amp/__init__.py in <module>
2 from torch.optim import Optimizer
3
----> 4 from .apex_amp import ApexAMPOptimizer
5
6
~/anaconda3/lib/python3.9/site-packages/colossalai/amp/apex_amp/apex_amp.py in <module>
11 from torch import Tensor
12
---> 13 from colossalai.nn.optimizer import ColossalaiOptimizer
14 from colossalai.utils import clip_grad_norm_fp32
15
~/anaconda3/lib/python3.9/site-packages/colossalai/nn/__init__.py in <module>
----> 1 from ._ops import *
2 from .layer import *
3 from .loss import *
4 from .lr_scheduler import *
5 from .metric import *
~/anaconda3/lib/python3.9/site-packages/colossalai/nn/_ops/__init__.py in <module>
----> 1 from .addmm import colo_addmm
2 from .batch_norm import colo_batch_norm
3 from .element_wise import *
4 from .embedding import colo_embedding
5 from .embedding_bag import colo_embedding_bag
~/anaconda3/lib/python3.9/site-packages/colossalai/nn/_ops/addmm.py in <module>
4 from colossalai.tensor.op_wrapper import colo_op_impl
5
----> 6 from ._utils import GeneralTensor, Number, convert_to_colo_tensor, reduce_grad, reduce_input
7
8
~/anaconda3/lib/python3.9/site-packages/colossalai/nn/_ops/_utils.py in <module>
5
6 from colossalai.global_variables import tensor_parallel_env as env
----> 7 from colossalai.nn.layer.utils import divide
8 from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup
9
~/anaconda3/lib/python3.9/site-packages/colossalai/nn/layer/__init__.py in <module>
5 from .parallel_3d import *
6 from .parallel_sequence import *
----> 7 from .moe import *
8 from .utils import *
9 from .vanilla import *
~/anaconda3/lib/python3.9/site-packages/colossalai/nn/layer/moe/__init__.py in <module>
----> 1 from .checkpoint import load_moe_model, save_moe_model
2 from .experts import Experts, FFNExperts, TPExperts
3 from .layers import MoeLayer, MoeModule
4 from .routers import MoeRouter, Top1Router, Top2Router
5 from .utils import NormalNoiseGenerator, UniformNoiseGenerator, build_ffn_experts
~/anaconda3/lib/python3.9/site-packages/colossalai/nn/layer/moe/checkpoint.py in <module>
3 import torch.nn as nn
4
----> 5 from .experts import MoeExperts
6
7
~/anaconda3/lib/python3.9/site-packages/colossalai/nn/layer/moe/experts.py in <module>
10 from colossalai.context.moe_context import MOE_CONTEXT
11 from colossalai.utils import get_current_device
---> 12 from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator
13
14
~/anaconda3/lib/python3.9/site-packages/colossalai/zero/__init__.py in <module>
----> 1 from .gemini import (
2 ColoInitContext,
3 GeminiAdamOptimizer,
4 GeminiDDP,
5 ZeroDDP,
~/anaconda3/lib/python3.9/site-packages/colossalai/zero/gemini/__init__.py in <module>
1 from .chunk import ChunkManager, TensorInfo, TensorState, search_chunk_configuration
2 from .colo_init_context import ColoInitContext, post_process_colo_init_ctx
----> 3 from .gemini_ddp import GeminiDDP, ZeroDDP
4 from .gemini_mgr import GeminiManager
5 from .gemini_optimizer import GeminiAdamOptimizer, ZeroOptimizer
~/anaconda3/lib/python3.9/site-packages/colossalai/zero/gemini/gemini_ddp.py in <module>
17 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
18 from colossalai.utils import get_current_device, is_ddp_ignored
---> 19 from colossalai.utils.model.experimental import LazyTensor
20
21 from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager
~/anaconda3/lib/python3.9/site-packages/colossalai/utils/model/experimental.py in <module>
8 from torch.utils._pytree import tree_map
9
---> 10 from colossalai._analyzer._subclasses import MetaTensor
11 from colossalai.tensor.d_tensor.d_tensor import DTensor
12 from colossalai.tensor.d_tensor.layout import Layout
ModuleNotFoundError: No module named 'colossalai._analyzer'
I think this is due to a missing `__init__.py` in `colossalai/_analyzer`.
Environment
colossalai: master
🐛 Describe the bug
I was unable to build the docker image when running
`docker build -t colossalai ./docker`, with the errors shown above. This issue can be resolved by pinning the apex commit number.
After fixing this issue, I was still unable to import colossalai with error:
I think this is due to a missing `__init__.py` in `colossalai/_analyzer`.
Environment
colossalai: master