Skip to content

[BUG]: Docker image build failure due to apex version mismatch and missing __init__.py in colossalai/_analyzer #3690

@ymwangg

Description

@ymwangg

🐛 Describe the bug

I was unable to build the Docker image. Running `docker build -t colossalai ./docker` failed with the following errors:

[+] Building 935.4s (6/9)                                                                                                                                                               
 => [internal] load build definition from Dockerfile                                                                                                                               0.0s
 => => transferring dockerfile: 1.14kB                                                                                                                                             0.0s
 => [internal] load .dockerignore                                                                                                                                                  0.0s
 => => transferring context: 2B                                                                                                                                                    0.0s
 => [internal] load metadata for docker.io/hpcaitech/cuda-conda:11.3                                                                                                               0.5s
 => [1/6] FROM docker.io/hpcaitech/cuda-conda:11.3@sha256:8354717606e7be53824ff663ab3d4d0f99473f92896de00131d1e6a9a3bbd21d                                                         0.0s
 => CACHED [2/6] RUN conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch                                                              0.0s
 => ERROR [3/6] RUN git clone https://github.com/NVIDIA/apex &&     cd apex &&     pip install packaging &&     pip install -v --disable-pip-version-check --no-cache-dir --glo  934.8s
------  
#0 931.0     /opt/conda/lib/python3.9/site-packages/torch/include/torch/csrc/jit/ir/attributes.h:35:27: note:   ‘torch::jit::toString’
#0 931.0        35 | static inline const char* toString(AttributeKind kind) {
#0 931.0           |                           ^~~~~~~~
#0 934.5     error: command '/usr/bin/gcc' failed with exit code 1
#0 934.7     Running setup.py install for apex: finished with status 'error'
#0 934.7 ERROR: Command errored out with exit status 1: /opt/conda/bin/python -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-req-build-hi5ozq1r/setup.py'"'"'; __file__='"'"'/tmp/pip-req-build-hi5ozq1r/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' --cpp_ext --cuda_ext --fast_layer_norm install --record /tmp/pip-record-5zinxil4/install-record.txt --single-version-externally-managed --compile --install-headers /opt/conda/include/python3.9/apex Check the logs for full command output.
------
Dockerfile:12
--------------------
  11 |     # install apex
  12 | >>> RUN git clone https://github.com/NVIDIA/apex && \
  13 | >>>     cd apex && \
  14 | >>>     pip install packaging && \
  15 | >>>     pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./
  16 |     
--------------------
ERROR: failed to solve: process "/bin/sh -c git clone https://github.com/NVIDIA/apex &&     cd apex &&     pip install packaging &&     pip install -v --disable-pip-version-check --no-cache-dir --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" --global-option=\"--fast_layer_norm\" ./" did not complete successfully: exit code: 1

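This issue can be resolved by pinning the apex checkout to a specific commit that builds against the installed PyTorch version (1.12.1), rather than building whatever is currently at the head of the apex repository.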

After fixing this issue, I was still unable to import colossalai; the import failed with the following error:

In [1]: import colossalai
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-4afa607b1f63> in <module>
----> 1 import colossalai

~/anaconda3/lib/python3.9/site-packages/colossalai/__init__.py in <module>
----> 1 from .initialize import (
      2     get_default_parser,
      3     initialize,
      4     launch,
      5     launch_from_openmpi,

~/anaconda3/lib/python3.9/site-packages/colossalai/initialize.py in <module>
     16 from torch.utils.data import DataLoader
     17 
---> 18 from colossalai.amp import AMP_TYPE, convert_to_amp
     19 from colossalai.amp.naive_amp import NaiveAMPModel
     20 from colossalai.builder.builder import build_gradient_handler

~/anaconda3/lib/python3.9/site-packages/colossalai/amp/__init__.py in <module>
      9 
     10 from .amp_type import AMP_TYPE
---> 11 from .apex_amp import convert_to_apex_amp
     12 from .naive_amp import convert_to_naive_amp
     13 from .torch_amp import convert_to_torch_amp

~/anaconda3/lib/python3.9/site-packages/colossalai/amp/apex_amp/__init__.py in <module>
      2 from torch.optim import Optimizer
      3 
----> 4 from .apex_amp import ApexAMPOptimizer
      5 
      6 

~/anaconda3/lib/python3.9/site-packages/colossalai/amp/apex_amp/apex_amp.py in <module>
     11 from torch import Tensor
     12 
---> 13 from colossalai.nn.optimizer import ColossalaiOptimizer
     14 from colossalai.utils import clip_grad_norm_fp32
     15 

~/anaconda3/lib/python3.9/site-packages/colossalai/nn/__init__.py in <module>
----> 1 from ._ops import *
      2 from .layer import *
      3 from .loss import *
      4 from .lr_scheduler import *
      5 from .metric import *

~/anaconda3/lib/python3.9/site-packages/colossalai/nn/_ops/__init__.py in <module>
----> 1 from .addmm import colo_addmm
      2 from .batch_norm import colo_batch_norm
      3 from .element_wise import *
      4 from .embedding import colo_embedding
      5 from .embedding_bag import colo_embedding_bag

~/anaconda3/lib/python3.9/site-packages/colossalai/nn/_ops/addmm.py in <module>
      4 from colossalai.tensor.op_wrapper import colo_op_impl
      5 
----> 6 from ._utils import GeneralTensor, Number, convert_to_colo_tensor, reduce_grad, reduce_input
      7 
      8 

~/anaconda3/lib/python3.9/site-packages/colossalai/nn/_ops/_utils.py in <module>
      5 
      6 from colossalai.global_variables import tensor_parallel_env as env
----> 7 from colossalai.nn.layer.utils import divide
      8 from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup
      9 

~/anaconda3/lib/python3.9/site-packages/colossalai/nn/layer/__init__.py in <module>
      5 from .parallel_3d import *
      6 from .parallel_sequence import *
----> 7 from .moe import *
      8 from .utils import *
      9 from .vanilla import *

~/anaconda3/lib/python3.9/site-packages/colossalai/nn/layer/moe/__init__.py in <module>
----> 1 from .checkpoint import load_moe_model, save_moe_model
      2 from .experts import Experts, FFNExperts, TPExperts
      3 from .layers import MoeLayer, MoeModule
      4 from .routers import MoeRouter, Top1Router, Top2Router
      5 from .utils import NormalNoiseGenerator, UniformNoiseGenerator, build_ffn_experts

~/anaconda3/lib/python3.9/site-packages/colossalai/nn/layer/moe/checkpoint.py in <module>
      3 import torch.nn as nn
      4 
----> 5 from .experts import MoeExperts
      6 
      7 

~/anaconda3/lib/python3.9/site-packages/colossalai/nn/layer/moe/experts.py in <module>
     10 from colossalai.context.moe_context import MOE_CONTEXT
     11 from colossalai.utils import get_current_device
---> 12 from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator
     13 
     14 

~/anaconda3/lib/python3.9/site-packages/colossalai/zero/__init__.py in <module>
----> 1 from .gemini import (
      2     ColoInitContext,
      3     GeminiAdamOptimizer,
      4     GeminiDDP,
      5     ZeroDDP,

~/anaconda3/lib/python3.9/site-packages/colossalai/zero/gemini/__init__.py in <module>
      1 from .chunk import ChunkManager, TensorInfo, TensorState, search_chunk_configuration
      2 from .colo_init_context import ColoInitContext, post_process_colo_init_ctx
----> 3 from .gemini_ddp import GeminiDDP, ZeroDDP
      4 from .gemini_mgr import GeminiManager
      5 from .gemini_optimizer import GeminiAdamOptimizer, ZeroOptimizer

~/anaconda3/lib/python3.9/site-packages/colossalai/zero/gemini/gemini_ddp.py in <module>
     17 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
     18 from colossalai.utils import get_current_device, is_ddp_ignored
---> 19 from colossalai.utils.model.experimental import LazyTensor
     20 
     21 from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager

~/anaconda3/lib/python3.9/site-packages/colossalai/utils/model/experimental.py in <module>
      8 from torch.utils._pytree import tree_map
      9 
---> 10 from colossalai._analyzer._subclasses import MetaTensor
     11 from colossalai.tensor.d_tensor.d_tensor import DTensor
     12 from colossalai.tensor.d_tensor.layout import Layout

ModuleNotFoundError: No module named 'colossalai._analyzer'

I think this is due to a missing `__init__.py` in `colossalai/_analyzer`.

Environment

colossalai: master

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions